/*
 *  kernel/cpuset.c
 *
 *  Processor and Memory placement constraints for sets of tasks.
 */

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/export.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/time.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>

#include <asm/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>
#include <linux/wait.h>

struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;

/*
 * Frequency meter: digitally filtered, constant-time-based event rate.
 * See the "Frequency meter" comment near fmeter_init() below.
 */
struct fmeter {
	int cnt;		/* unprocessed events count */
	int val;		/* most recent output value */
	time_t time;		/* clock (secs) when val computed */
	spinlock_t lock;	/* guards read or write of above */
};

struct cpuset {
	struct cgroup_subsys_state css;

	unsigned long flags;		/* "unsigned long" so bitops work */

	/*
	 * On default hierarchy:
	 *
	 * The user-configured masks can only be changed by writing to
	 * cpuset.cpus and cpuset.mems, and won't be limited by the
	 * parent masks.
	 *
	 * The effective masks are the real masks that apply to the tasks
	 * in the cpuset.  They may be changed if the configured masks are
	 * changed or hotplug happens.
	 *
	 * effective_mask == configured_mask & parent's effective_mask,
	 * and if it ends up empty, it will inherit the parent's mask.
	 *
	 * On legacy hierarchy:
	 *
	 * The user-configured masks are always the same as the effective
	 * masks.
	 */

	/* user-configured CPUs and Memory Nodes allowed to tasks */
	cpumask_var_t cpus_allowed;
	nodemask_t mems_allowed;

	/* effective CPUs and Memory Nodes allowed to tasks */
	cpumask_var_t effective_cpus;
	nodemask_t effective_mems;

	/*
	 * This is old Memory Nodes tasks took on.
	 *
	 * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
	 * - A new cpuset's old_mems_allowed is initialized when some
	 *   task is moved into it.
	 * - old_mems_allowed is used in cpuset_migrate_mm() when we change
	 *   cpuset.mems_allowed and have tasks' nodemasks updated, and
	 *   then old_mems_allowed is updated to mems_allowed.
	 */
	nodemask_t old_mems_allowed;

	struct fmeter fmeter;		/* memory_pressure filter */

	/*
	 * Tasks are being attached to this cpuset.  Used to prevent
	 * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
	 */
	int attach_in_progress;

	/* partition number for rebuild_sched_domains() */
	int pn;

	/* for custom sched domain */
	int relax_domain_level;
};

static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
{
	return css ? container_of(css, struct cpuset, css) : NULL;
}

/* Retrieve the cpuset for a task */
static inline struct cpuset *task_cs(struct task_struct *task)
{
	return css_cs(task_css(task, cpuset_cgrp_id));
}

static inline struct cpuset *parent_cs(struct cpuset *cs)
{
	return css_cs(cs->css.parent);
}

#ifdef CONFIG_NUMA
static inline bool task_has_mempolicy(struct task_struct *task)
{
	return task->mempolicy;
}
#else
static inline bool task_has_mempolicy(struct task_struct *task)
{
	return false;
}
#endif

/* bits in struct cpuset flags field */
typedef enum {
	CS_ONLINE,
	CS_CPU_EXCLUSIVE,
	CS_MEM_EXCLUSIVE,
	CS_MEM_HARDWALL,
	CS_MEMORY_MIGRATE,
	CS_SCHED_LOAD_BALANCE,
	CS_SPREAD_PAGE,
	CS_SPREAD_SLAB,
} cpuset_flagbits_t;

/* convenient tests for these bits */
static inline bool is_cpuset_online(const struct cpuset *cs)
{
	return test_bit(CS_ONLINE, &cs->flags);
}

static inline int is_cpu_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_hardwall(const struct cpuset *cs)
{
	return test_bit(CS_MEM_HARDWALL, &cs->flags);
}

static inline int is_sched_load_balance(const struct cpuset *cs)
{
	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}

static inline int is_memory_migrate(const struct cpuset *cs)
{
	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
}

static inline int is_spread_page(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_PAGE, &cs->flags);
}

static inline int is_spread_slab(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_SLAB, &cs->flags);
}

static struct cpuset top_cpuset = {
	.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
		  (1 << CS_MEM_EXCLUSIVE)),
};

/**
 * cpuset_for_each_child - traverse online children of a cpuset
 * @child_cs: loop cursor pointing to the current child
 * @pos_css: used for iteration
 * @parent_cs: target cpuset to walk children of
 *
 * Walk @child_cs through the online children of @parent_cs.  Must be used
 * with RCU read locked.
 */
#define cpuset_for_each_child(child_cs, pos_css, parent_cs)		\
	css_for_each_child((pos_css), &(parent_cs)->css)		\
		if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))

/**
 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
 * @des_cs: loop cursor pointing to the current cpuset
 * @pos_css: used for iteration
 * @root_cs: target cpuset to walk ancestor of
 *
 * Walk @des_cs through the online descendants of @root_cs.  Must be used
 * with RCU read locked.  The caller may modify @pos_css by calling
 * css_rightmost_descendant() to skip a subtree.
 */
#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)	\
	css_for_each_descendant_pre((pos_css), &(root_cs)->css)	\
		if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))

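/*
 * For example, a typical walk with the iterator above (mirroring
 * update_domain_attr_tree() later in this file) takes the RCU read
 * lock, visits each online descendant, and prunes a subtree with
 * css_rightmost_descendant() when it can be skipped:
 *
 *	rcu_read_lock();
 *	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
 *		if (cpumask_empty(cp->cpus_allowed)) {
 *			pos_css = css_rightmost_descendant(pos_css);
 *			continue;
 *		}
 *		...
 *	}
 *	rcu_read_unlock();
 */
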
/*
 * There are two global locks guarding cpuset structures - cpuset_mutex
 * and callback_lock.  Modifying a cpuset requires cpuset_mutex; readers
 * that need a stable view of a cpuset's cpus_allowed or mems_allowed
 * take callback_lock.
 *
 * cpuset_mutex is the top-level lock and may sleep.  callback_lock is a
 * spinlock that is taken, irq-safe, from performance-critical paths such
 * as the page allocator, so code holding it must not block and must not
 * take cpuset_mutex.
 *
 * Accessing a task's cpuset should be done inside an RCU read-side
 * section; the cpuset won't be freed while the section is held.
 */

static DEFINE_MUTEX(cpuset_mutex);
static DEFINE_SPINLOCK(callback_lock);

/*
 * CPU / memory hotplug is handled asynchronously
 * from a work item to avoid locking order problems.
 */
static void cpuset_hotplug_workfn(struct work_struct *work);
static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);

static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);

/*
 * This is ugly, but preserves the userspace API for existing cpuset
 * users.  If someone tries to mount the "cpuset" filesystem, we
 * silently switch it to mount "cgroup" instead
 */
static struct dentry *cpuset_mount(struct file_system_type *fs_type,
			 int flags, const char *unused_dev_name, void *data)
{
	struct file_system_type *cgroup_fs = get_fs_type("cgroup");
	struct dentry *ret = ERR_PTR(-ENODEV);
	if (cgroup_fs) {
		char mountopts[] =
			"cpuset,noprefix,"
			"release_agent=/sbin/cpuset_release_agent";
		ret = cgroup_fs->mount(cgroup_fs, flags,
					   unused_dev_name, mountopts);
		put_filesystem(cgroup_fs);
	}
	return ret;
}

static struct file_system_type cpuset_fs_type = {
	.name = "cpuset",
	.mount = cpuset_mount,
};

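/*
 * For illustration, mounting the legacy interface directly:
 *
 *	# mount -t cpuset cpuset /dev/cpuset
 *
 * behaves like mounting "cgroup" with the option string built above
 * (the cpuset controller, no "cpuset." prefix on file names, and the
 * traditional release agent).
 */
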
/*
 * Return in pmask the portion of a cpuset's cpus_allowed that
 * are online.  If none are online, walk up the cpuset hierarchy
 * until we find one that does have some online cpus.  The top
 * cpuset always has some cpus online.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of cpu_online_mask.
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
{
	while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
		cs = parent_cs(cs);
	cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
}

/*
 * Return in *pmask the portion of a cpuset's mems_allowed that
 * are online, with memory.  If none are online with memory, walk
 * up the cpuset hierarchy until we find one that does have some
 * online mems.  The top cpuset always has some mems online.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of node_states[N_MEMORY].
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
	while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
		cs = parent_cs(cs);
	nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
}

/*
 * update task's spread flag if cpuset's page/slab spread flag is set
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void cpuset_update_task_spread_flag(struct cpuset *cs,
					struct task_struct *tsk)
{
	if (is_spread_page(cs))
		task_set_spread_page(tsk);
	else
		task_clear_spread_page(tsk);

	if (is_spread_slab(cs))
		task_set_spread_slab(tsk);
	else
		task_clear_spread_slab(tsk);
}

/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set.  Call holding cpuset_mutex.
 */
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
	return	cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
		nodes_subset(p->mems_allowed, q->mems_allowed) &&
		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
		is_mem_exclusive(p) <= is_mem_exclusive(q);
}

/**
 * alloc_trial_cpuset - allocate a trial cpuset
 * @cs: the cpuset that the trial cpuset duplicates
 */
static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
{
	struct cpuset *trial;

	trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
	if (!trial)
		return NULL;

	if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
		goto free_cs;
	if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
		goto free_cpus;

	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
	cpumask_copy(trial->effective_cpus, cs->effective_cpus);
	return trial;

free_cpus:
	free_cpumask_var(trial->cpus_allowed);
free_cs:
	kfree(trial);
	return NULL;
}

/**
 * free_trial_cpuset - free the trial cpuset
 * @trial: the trial cpuset to be freed
 */
static void free_trial_cpuset(struct cpuset *trial)
{
	free_cpumask_var(trial->effective_cpus);
	free_cpumask_var(trial->cpus_allowed);
	kfree(trial);
}

/*
 * validate_change() - Used to validate that any proposed cpuset change
 *		       follows the structural rules for cpusets.
 *
 * If we replaced the flag and mask values of the current cpuset
 * (cur) with those values in the trial cpuset (trial), would
 * our various subset and exclusive rules still be valid?  Presumes
 * cpuset_mutex held.
 *
 * 'cur' is the address of an actual, in-use cpuset.  Operations
 * such as list traversal that depend on the actual address of the
 * cpuset in the list must use cur below, not trial.
 *
 * 'trial' is the address of a bulk structure copy of cur, with
 * perhaps one or more of the fields cpus_allowed, mems_allowed,
 * or flags changed to new, trial values.
 *
 * Return 0 if valid, -errno if not.
 */
static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
	struct cgroup_subsys_state *css;
	struct cpuset *c, *par;
	int ret;

	rcu_read_lock();

	/* Each of our child cpusets must be a subset of us */
	ret = -EBUSY;
	cpuset_for_each_child(c, css, cur)
		if (!is_cpuset_subset(c, trial))
			goto out;

	/* Remaining checks don't apply to root cpuset */
	ret = 0;
	if (cur == &top_cpuset)
		goto out;

	par = parent_cs(cur);

	/* On legacy hierarchy, we must be a subset of our parent cpuset. */
	ret = -EACCES;
	if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par))
		goto out;

	/*
	 * If either I or some sibling (!= me) is exclusive, we can't
	 * overlap
	 */
	ret = -EINVAL;
	cpuset_for_each_child(c, css, par) {
		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
		    c != cur &&
		    cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
			goto out;
		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
		    c != cur &&
		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
			goto out;
	}

	/*
	 * Cpusets with tasks - existing or newly being attached - can't
	 * be changed to have empty cpus_allowed or mems_allowed.
	 */
	ret = -ENOSPC;
	if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) {
		if (!cpumask_empty(cur->cpus_allowed) &&
		    cpumask_empty(trial->cpus_allowed))
			goto out;
		if (!nodes_empty(cur->mems_allowed) &&
		    nodes_empty(trial->mems_allowed))
			goto out;
	}

	/*
	 * We can't shrink if we won't have enough room for SCHED_DEADLINE
	 * tasks.
	 */
	ret = -EBUSY;
	if (is_cpu_exclusive(cur) &&
	    !cpuset_cpumask_can_shrink(cur->cpus_allowed,
				       trial->cpus_allowed))
		goto out;

	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}

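/*
 * Example of the rules above (illustrative values): if a sibling is
 * marked cpu_exclusive with cpus 0-3, a request to give this cpuset
 * cpus 2-5 overlaps it and fails with -EINVAL; emptying the cpus or
 * mems of a cpuset that still contains tasks fails with -ENOSPC.
 */
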
#ifdef CONFIG_SMP
/*
 * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping effective cpus_allowed masks?
 */
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
	return cpumask_intersects(a->effective_cpus, b->effective_cpus);
}

static void
update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
{
	if (dattr->relax_domain_level < c->relax_domain_level)
		dattr->relax_domain_level = c->relax_domain_level;
}

static void update_domain_attr_tree(struct sched_domain_attr *dattr,
				    struct cpuset *root_cs)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
		/* skip the whole subtree if @cp doesn't have any CPU */
		if (cpumask_empty(cp->cpus_allowed)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		if (is_sched_load_balance(cp))
			update_domain_attr(dattr, cp);
	}
	rcu_read_unlock();
}

/*
 * generate_sched_domains()
 *
 * This function builds a partial partition of the systems CPUs
 * A 'partial partition' is a set of non-overlapping subsets whose
 * union is a subset of that set.
 * The output of this function needs to be passed to kernel/sched/core.c
 * partition_sched_domains() routine, which will rebuild the scheduler's
 * load balancing domains (sched domains) as specified by that partial
 * partition.
 *
 * See "What is sched_load_balance" in Documentation/cgroups/cpusets.txt
 * for a background explanation of this.
 *
 * Does not return errors, on the theory that the callers of this
 * routine would rather not worry about failures to rebuild sched
 * domains when such failures are likely recoverable.
 *
 * Must be called with cpuset_mutex held.
 *
 * The three key local variables below are:
 *    doms  - Ordered list of sched domain masks, one per partition.
 *    csa   - (for CpuSet Array) Array of pointers to all the cpusets
 *	      that need to be load balanced.
 *    csn   - The number of cpusets recorded in csa.
 *
 * The algorithm first collects every load-balanced cpuset, then
 * repeatedly merges any two whose effective CPUs overlap into the
 * same partition (tracked via the ->pn partition numbers), and
 * finally builds one sched domain cpumask per remaining partition.
 */
static int generate_sched_domains(cpumask_var_t **domains,
			struct sched_domain_attr **attributes)
{
	struct cpuset *cp;	/* scans cpusets being updated */
	struct cpuset **csa;	/* array of all cpuset ptrs */
	int csn;		/* how many cpuset ptrs in csa so far */
	int i, j, k;		/* indices for partition finding loops */
	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
	cpumask_var_t non_isolated_cpus;  /* load balanced CPUs */
	struct sched_domain_attr *dattr;  /* attributes for custom domains */
	int ndoms = 0;		/* number of sched domains in result */
	int nslot;		/* next empty doms[] struct cpumask slot */
	struct cgroup_subsys_state *pos_css;

	doms = NULL;
	dattr = NULL;
	csa = NULL;

	if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
		goto done;
	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);

	/* Special case for the 99% of systems with one, full, sched domain */
	if (is_sched_load_balance(&top_cpuset)) {
		ndoms = 1;
		doms = alloc_sched_domains(ndoms);
		if (!doms)
			goto done;

		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
		if (dattr) {
			*dattr = SD_ATTR_INIT;
			update_domain_attr_tree(dattr, &top_cpuset);
		}
		cpumask_and(doms[0], top_cpuset.effective_cpus,
				     non_isolated_cpus);

		goto done;
	}

	csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL);
	if (!csa)
		goto done;
	csn = 0;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
		if (cp == &top_cpuset)
			continue;
		/*
		 * Continue traversing beyond @cp iff @cp has some CPUs and
		 * isn't load balancing.  The one exception is a cpuset with
		 * empty cpus_allowed: its children may still have CPUs and
		 * may need load balancing, so don't prune its subtree.
		 */
		if (!cpumask_empty(cp->cpus_allowed) &&
		    !(is_sched_load_balance(cp) &&
		      cpumask_intersects(cp->cpus_allowed, non_isolated_cpus)))
			continue;

		if (is_sched_load_balance(cp))
			csa[csn++] = cp;

		/* skip @cp's subtree */
		pos_css = css_rightmost_descendant(pos_css);
	}
	rcu_read_unlock();

	for (i = 0; i < csn; i++)
		csa[i]->pn = i;
	ndoms = csn;

restart:
	/* Find the best partition (set of sched domains) */
	for (i = 0; i < csn; i++) {
		struct cpuset *a = csa[i];
		int apn = a->pn;

		for (j = 0; j < csn; j++) {
			struct cpuset *b = csa[j];
			int bpn = b->pn;

			if (apn != bpn && cpusets_overlap(a, b)) {
				for (k = 0; k < csn; k++) {
					struct cpuset *c = csa[k];

					if (c->pn == bpn)
						c->pn = apn;
				}
				ndoms--;	/* one less element */
				goto restart;
			}
		}
	}

	/*
	 * Now we know how many domains to create.
	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
	 */
	doms = alloc_sched_domains(ndoms);
	if (!doms)
		goto done;

	/*
	 * The rest of the code, including the scheduler, can deal with
	 * dattr==NULL case. No need to abort if alloc fails.
	 */
	dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);

	for (nslot = 0, i = 0; i < csn; i++) {
		struct cpuset *a = csa[i];
		struct cpumask *dp;
		int apn = a->pn;

		if (apn < 0) {
			/* Skip completed partitions */
			continue;
		}

		dp = doms[nslot];

		if (nslot == ndoms) {
			static int warnings = 10;
			if (warnings) {
				pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
					nslot, ndoms, csn, i, apn);
				warnings--;
			}
			continue;
		}

		cpumask_clear(dp);
		if (dattr)
			*(dattr + nslot) = SD_ATTR_INIT;
		for (j = i; j < csn; j++) {
			struct cpuset *b = csa[j];

			if (apn == b->pn) {
				cpumask_or(dp, dp, b->effective_cpus);
				cpumask_and(dp, dp, non_isolated_cpus);
				if (dattr)
					update_domain_attr_tree(dattr + nslot, b);

				/* Done with this partition */
				b->pn = -1;
			}
		}
		nslot++;
	}
	BUG_ON(nslot != ndoms);

done:
	free_cpumask_var(non_isolated_cpus);
	kfree(csa);

	/*
	 * Fallback to the default domain if kmalloc() failed.
	 * See comments in partition_sched_domains().
	 */
	if (doms == NULL)
		ndoms = 1;

	*domains    = doms;
	*attributes = dattr;
	return ndoms;
}

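/*
 * Worked example (illustrative CPU numbers, no isolated CPUs): with
 * load balancing off in top_cpuset and two non-overlapping children A
 * (cpus 0-1) and B (cpus 2-3), each with sched_load_balance set, no
 * merging occurs and the function returns ndoms == 2 with
 * doms[0] == 0-1 and doms[1] == 2-3.  Had A and B overlapped, the
 * restart loop would have merged them into a single domain.
 */
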
/*
 * Rebuild scheduler domains.
 *
 * If the flag 'sched_load_balance' of any cpuset with non-empty
 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
 * which has that flag enabled, or if any cpuset with a non-empty
 * 'cpus' is removed, then call this routine to rebuild the
 * scheduler's dynamic sched domains.
 *
 * Call with cpuset_mutex held.  Takes get_online_cpus().
 */
static void rebuild_sched_domains_locked(void)
{
	struct sched_domain_attr *attr;
	cpumask_var_t *doms;
	int ndoms;

	lockdep_assert_held(&cpuset_mutex);
	get_online_cpus();

	/*
	 * We have raced with CPU hotplug. Don't do anything to avoid
	 * passing doms with offlined cpu to partition_sched_domains().
	 * Anyways, hotplug work item will rebuild sched domains.
	 */
	if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
		goto out;

	/* Generate domain masks and attrs */
	ndoms = generate_sched_domains(&doms, &attr);

	/* Have scheduler rebuild the domains */
	partition_sched_domains(ndoms, doms, attr);
out:
	put_online_cpus();
}
#else /* !CONFIG_SMP */
static void rebuild_sched_domains_locked(void)
{
}
#endif /* CONFIG_SMP */

void rebuild_sched_domains(void)
{
	mutex_lock(&cpuset_mutex);
	rebuild_sched_domains_locked();
	mutex_unlock(&cpuset_mutex);
}

/**
 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
 *
 * Iterate through each task of @cs updating its cpus_allowed to the
 * effective cpuset's.  As this function is called with cpuset_mutex held,
 * cpuset membership stays stable.
 */
static void update_tasks_cpumask(struct cpuset *cs)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(&cs->css, &it);
	while ((task = css_task_iter_next(&it)))
		set_cpus_allowed_ptr(task, cs->effective_cpus);
	css_task_iter_end(&it);
}

/*
 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
 * @cs: the cpuset to consider
 * @new_cpus: temp variable for calculating new effective_cpus
 *
 * When the configured cpumask is changed, the effective cpumasks of this
 * cpuset and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
 *
 * Called with cpuset_mutex held
 */
static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;
	bool need_rebuild_sched_domains = false;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
		struct cpuset *parent = parent_cs(cp);

		cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);

		/*
		 * If it becomes empty, inherit the effective mask of the
		 * parent, which is guaranteed to have some CPUs.
		 */
		if (cgroup_on_dfl(cp->css.cgroup) && cpumask_empty(new_cpus))
			cpumask_copy(new_cpus, parent->effective_cpus);

		/* Skip the whole subtree if the cpumask remains the same. */
		if (cpumask_equal(new_cpus, cp->effective_cpus)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		if (!css_tryget_online(&cp->css))
			continue;
		rcu_read_unlock();

		spin_lock_irq(&callback_lock);
		cpumask_copy(cp->effective_cpus, new_cpus);
		spin_unlock_irq(&callback_lock);

		WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));

		update_tasks_cpumask(cp);

		/*
		 * If the effective cpumask of any non-empty cpuset is
		 * changed, we need to rebuild sched domains.
		 */
		if (!cpumask_empty(cp->cpus_allowed) &&
		    is_sched_load_balance(cp))
			need_rebuild_sched_domains = true;

		rcu_read_lock();
		css_put(&cp->css);
	}
	rcu_read_unlock();

	if (need_rebuild_sched_domains)
		rebuild_sched_domains_locked();
}

/**
 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
 * @cs: the cpuset to consider
 * @trialcs: trial cpuset
 * @buf: buffer of cpu numbers written to this cpuset
 */
static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
			  const char *buf)
{
	int retval;

	/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
	if (cs == &top_cpuset)
		return -EACCES;

	/*
	 * An empty cpus_allowed is ok only if the cpuset has no tasks.
	 * Since cpulist_parse() fails on an empty mask, we special case
	 * that parsing.  The validate_change() call ensures that cpusets
	 * with tasks have cpus.
	 */
	if (!*buf) {
		cpumask_clear(trialcs->cpus_allowed);
	} else {
		retval = cpulist_parse(buf, trialcs->cpus_allowed);
		if (retval < 0)
			return retval;

		if (!cpumask_subset(trialcs->cpus_allowed,
				    top_cpuset.cpus_allowed))
			return -EINVAL;
	}

	/* Nothing to do if the cpus didn't change */
	if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
		return 0;

	retval = validate_change(cs, trialcs);
	if (retval < 0)
		return retval;

	spin_lock_irq(&callback_lock);
	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
	spin_unlock_irq(&callback_lock);

	/* use trialcs->cpus_allowed as a temp variable */
	update_cpumasks_hier(cs, trialcs->cpus_allowed);
	return 0;
}

/*
 * cpuset_migrate_mm
 *
 *    Migrate memory region from one set of nodes to another.
 *
 *    Temporarily set tasks mems_allowed to target nodes of migration,
 *    so that the migration code can allocate pages on these nodes.
 *
 *    While the mm_struct we are migrating is typically from some
 *    other task, the task_struct mems_allowed that we are hacking
 *    is for our current task, which must allocate new pages for that
 *    migrating memory region.
 */
static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
							const nodemask_t *to)
{
	struct task_struct *tsk = current;

	tsk->mems_allowed = *to;

	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);

	rcu_read_lock();
	guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
	rcu_read_unlock();
}

/*
 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
 * @tsk: the task to change
 * @newmems: new nodes that the task will be set
 *
 * In order to avoid seeing no nodes if the old and new nodes are disjoint,
 * we structure updates as setting all new allowed nodes, then clearing
 * newly disallowed ones.
 */
static void cpuset_change_task_nodemask(struct task_struct *tsk,
					nodemask_t *newmems)
{
	bool need_loop;

	/*
	 * Allow tasks that have access to memory reserves because they have
	 * been OOM killed to get memory anywhere.
	 */
	if (unlikely(test_thread_flag(TIF_MEMDIE)))
		return;
	if (current->flags & PF_EXITING) /* Let dying task have memory */
		return;

	task_lock(tsk);
	/*
	 * Determine if a seqcount-protected loop is needed: only if the
	 * task has a mempolicy that must be rebound in two steps, or the
	 * new mask doesn't intersect the old one, can readers observe an
	 * intermediate state and need to be made to retry.
	 */
	need_loop = task_has_mempolicy(tsk) ||
			!nodes_intersects(*newmems, tsk->mems_allowed);

	if (need_loop) {
		local_irq_disable();
		write_seqcount_begin(&tsk->mems_allowed_seq);
	}

	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);

	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
	tsk->mems_allowed = *newmems;

	if (need_loop) {
		write_seqcount_end(&tsk->mems_allowed_seq);
		local_irq_enable();
	}

	task_unlock(tsk);
}

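/*
 * Readers of mems_allowed pair with the seqcount written above: e.g.
 * the page allocator brackets its use of current->mems_allowed with
 * read_mems_allowed_begin()/read_mems_allowed_retry(), so a concurrent
 * rebind makes it retry instead of acting on a transient mask.
 */
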
static void *cpuset_being_rebound;

/**
 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
 *
 * Iterate through each task of @cs updating its mems_allowed to the
 * effective cpuset's.  As this function is called with cpuset_mutex held,
 * cpuset membership stays stable.
 */
static void update_tasks_nodemask(struct cpuset *cs)
{
	static nodemask_t newmems;	/* protected by cpuset_mutex */
	struct css_task_iter it;
	struct task_struct *task;

	cpuset_being_rebound = cs;	/* causes mpol_dup() rebind */

	guarantee_online_mems(cs, &newmems);

	/*
	 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
	 * take while holding tasklist_lock.  Forks can happen - the
	 * mpol_dup() cpuset_being_rebound check will catch such forks,
	 * and rebind their vma mempolicies too.  Because we still hold
	 * the global cpuset_mutex, we know that no other rebind effort
	 * will be contending for the global variable cpuset_being_rebound.
	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
	 * is idempotent.
	 */
	css_task_iter_start(&cs->css, &it);
	while ((task = css_task_iter_next(&it))) {
		struct mm_struct *mm;
		bool migrate;

		cpuset_change_task_nodemask(task, &newmems);

		mm = get_task_mm(task);
		if (!mm)
			continue;

		migrate = is_memory_migrate(cs);

		mpol_rebind_mm(mm, &cs->mems_allowed);
		if (migrate)
			cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
		mmput(mm);
	}
	css_task_iter_end(&it);

	/*
	 * All the tasks' nodemasks have been updated, update
	 * cs->old_mems_allowed.
	 */
	cs->old_mems_allowed = newmems;

	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
	cpuset_being_rebound = NULL;
}

/*
 * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
 * @cs: the cpuset to consider
 * @new_mems: a temp variable for calculating new effective_mems
 *
 * When the configured nodemask is changed, the effective nodemasks of this
 * cpuset and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_mems will be the same as mems_allowed.
 *
 * Called with cpuset_mutex held
 */
static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
		struct cpuset *parent = parent_cs(cp);

		nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);

		/*
		 * If it becomes empty, inherit the effective mask of the
		 * parent, which is guaranteed to have some MEMs.
		 */
		if (cgroup_on_dfl(cp->css.cgroup) && nodes_empty(*new_mems))
			*new_mems = parent->effective_mems;

		/* Skip the whole subtree if the nodemask remains the same. */
		if (nodes_equal(*new_mems, cp->effective_mems)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		if (!css_tryget_online(&cp->css))
			continue;
		rcu_read_unlock();

		spin_lock_irq(&callback_lock);
		cp->effective_mems = *new_mems;
		spin_unlock_irq(&callback_lock);

		WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
			!nodes_equal(cp->mems_allowed, cp->effective_mems));

		update_tasks_nodemask(cp);

		rcu_read_lock();
		css_put(&cp->css);
	}
	rcu_read_unlock();
}

/*
 * Handle user request to change the 'mems' memory placement
 * of a cpuset.  Needs to validate the request, update the
 * cpusets mems_allowed, and for each task in the cpuset,
 * update mems_allowed and rebind task's mempolicy and any vma
 * mempolicies and if the cpuset is marked 'memory_migrate',
 * migrate the tasks pages to the new memory.
 *
 * Call with cpuset_mutex held. May take callback_lock during call.
 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
 * their mempolicies to the cpuset's new mems_allowed.
 */
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
			   const char *buf)
{
	int retval;

	/*
	 * top_cpuset.mems_allowed tracks node_states[N_MEMORY];
	 * it's read-only
	 */
	if (cs == &top_cpuset) {
		retval = -EACCES;
		goto done;
	}

	/*
	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
	 * Since nodelist_parse() fails on an empty mask, we special case
	 * that parsing.  The validate_change() call ensures that cpusets
	 * with tasks have memory.
	 */
	if (!*buf) {
		nodes_clear(trialcs->mems_allowed);
	} else {
		retval = nodelist_parse(buf, trialcs->mems_allowed);
		if (retval < 0)
			goto done;

		if (!nodes_subset(trialcs->mems_allowed,
				  top_cpuset.mems_allowed)) {
			retval = -EINVAL;
			goto done;
		}
	}

	if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
		retval = 0;		/* Too easy - nothing to do */
		goto done;
	}
	retval = validate_change(cs, trialcs);
	if (retval < 0)
		goto done;

	spin_lock_irq(&callback_lock);
	cs->mems_allowed = trialcs->mems_allowed;
	spin_unlock_irq(&callback_lock);

	/*
	 * Use trialcs->mems_allowed as a temp variable; passing
	 * &cs->mems_allowed here would let the hierarchy walk clobber
	 * the just-written configured mask, since *new_mems is used as
	 * scratch space.
	 */
	update_nodemasks_hier(cs, &trialcs->mems_allowed);
done:
	return retval;
}

int current_cpuset_is_being_rebound(void)
{
	int ret;

	rcu_read_lock();
	ret = task_cs(current) == cpuset_being_rebound;
	rcu_read_unlock();

	return ret;
}

static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
	if (val < -1 || val >= sched_domain_level_max)
		return -EINVAL;
#endif

	if (val != cs->relax_domain_level) {
		cs->relax_domain_level = val;
		if (!cpumask_empty(cs->cpus_allowed) &&
		    is_sched_load_balance(cs))
			rebuild_sched_domains_locked();
	}

	return 0;
}

/**
 * update_tasks_flags - update the spread flags of tasks in the cpuset.
 * @cs: the cpuset in which each task's spread flags needs to be changed
 *
 * Iterate through each task of @cs updating its spread flags.  As this
 * function is called with cpuset_mutex held, cpuset membership stays
 * stable.
 */
static void update_tasks_flags(struct cpuset *cs)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(&cs->css, &it);
	while ((task = css_task_iter_next(&it)))
		cpuset_update_task_spread_flag(cs, task);
	css_task_iter_end(&it);
}

/*
 * update_flag - read a 0 or a 1 in a file and update associated flag
 * bit:		the bit to update (see cpuset_flagbits_t)
 * cs:		the cpuset to update
 * turning_on:	whether the flag is being set or cleared
 *
 * Call with cpuset_mutex held.
 */
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
		       int turning_on)
{
	struct cpuset *trialcs;
	int balance_flag_changed;
	int spread_flag_changed;
	int err;

	trialcs = alloc_trial_cpuset(cs);
	if (!trialcs)
		return -ENOMEM;

	if (turning_on)
		set_bit(bit, &trialcs->flags);
	else
		clear_bit(bit, &trialcs->flags);

	err = validate_change(cs, trialcs);
	if (err < 0)
		goto out;

	balance_flag_changed = (is_sched_load_balance(cs) !=
				is_sched_load_balance(trialcs));

	spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
			|| (is_spread_page(cs) != is_spread_page(trialcs)));

	spin_lock_irq(&callback_lock);
	cs->flags = trialcs->flags;
	spin_unlock_irq(&callback_lock);

	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
		rebuild_sched_domains_locked();

	if (spread_flag_changed)
		update_tasks_flags(cs);
out:
	free_trial_cpuset(trialcs);
	return err;
}

/*
 * Frequency meter - How fast is some event occurring?
 *
 * These routines manage a digitally filtered, constant time based,
 * event frequency meter.  There are four routines:
 *   fmeter_init() - initialize a frequency meter.
 *   fmeter_markevent() - called each time the event happens.
 *   fmeter_getrate() - returns the recent rate of such events.
 *   fmeter_update() - internal routine used to update fmeter.
 *
 * A common data structure is passed to each of these routines,
 * which is used to keep track of the state required to manage the
 * frequency meter and its digital filter.
 *
 * The filter works on the number of events marked per unit time.
 * The filter is single-pole low-pass recursive (IIR).  The time unit
 * is 1 second.  Arithmetic is done using 32-bit integers scaled to
 * simulate 3 decimal digits of precision (multiplied by 1000).
 *
 * With an FM_COEF of 933, and a time base of 1 second, the filter
 * has a half-life of 10 seconds, meaning that if the events quit
 * happening, then the rate returned from the fmeter_getrate()
 * will be cut in half each 10 seconds, until it converges to zero.
 */

#define FM_COEF 933		/* coefficient for half-life of 10 secs */
#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
#define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
#define FM_SCALE 1000		/* faux fixed point scale */

/* Initialize a frequency meter */
static void fmeter_init(struct fmeter *fmp)
{
	fmp->cnt = 0;
	fmp->val = 0;
	fmp->time = 0;
	spin_lock_init(&fmp->lock);
}

/* Internal meter update - process cnt events and update value */
static void fmeter_update(struct fmeter *fmp)
{
	time_t now = get_seconds();
	time_t ticks = now - fmp->time;

	if (ticks == 0)
		return;

	ticks = min(FM_MAXTICKS, ticks);
	while (ticks-- > 0)
		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
	fmp->time = now;

	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
	fmp->cnt = 0;
}

/* Process any previous ticks, then bump cnt by one event. */
static void fmeter_markevent(struct fmeter *fmp)
{
	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
	spin_unlock(&fmp->lock);
}

/* Process any previous ticks, then return current value. */
static int fmeter_getrate(struct fmeter *fmp)
{
	int val;

	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	val = fmp->val;
	spin_unlock(&fmp->lock);
	return val;
}

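/*
 * Worked example of the filter arithmetic: one event adds FM_SCALE
 * (1000) to ->cnt, so the next update adds (FM_SCALE - FM_COEF) *
 * 1000 / FM_SCALE = 67 to ->val.  A sustained rate of one event per
 * second converges on 67 / (1 - 0.933) ~= 1000, i.e. fmeter_getrate()
 * reports roughly FM_SCALE per event per second, while an idle meter
 * halves about every 10 seconds (0.933^10 ~= 0.5).
 */
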
static struct cpuset *cpuset_attach_old_cs;

/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
static int cpuset_can_attach(struct cgroup_subsys_state *css,
			     struct cgroup_taskset *tset)
{
	struct cpuset *cs = css_cs(css);
	struct task_struct *task;
	int ret;

	/* used later by cpuset_attach() */
	cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset));

	mutex_lock(&cpuset_mutex);

	/* allow moving tasks into an empty cpuset if on default hierarchy */
	ret = -ENOSPC;
	if (!cgroup_on_dfl(css->cgroup) &&
	    (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
		goto out_unlock;

	cgroup_taskset_for_each(task, tset) {
		ret = task_can_attach(task, cs->cpus_allowed);
		if (ret)
			goto out_unlock;
		ret = security_task_setscheduler(task);
		if (ret)
			goto out_unlock;
	}

	/*
	 * Mark attach is in progress.  This makes validate_change() fail
	 * changes which zero cpus/mems_allowed.
	 */
	cs->attach_in_progress++;
	ret = 0;
out_unlock:
	mutex_unlock(&cpuset_mutex);
	return ret;
}

static void cpuset_cancel_attach(struct cgroup_subsys_state *css,
				 struct cgroup_taskset *tset)
{
	mutex_lock(&cpuset_mutex);
	css_cs(css)->attach_in_progress--;
	mutex_unlock(&cpuset_mutex);
}

/*
 * Protected by cpuset_mutex.  cpus_attach is used only by cpuset_attach()
 * but we can't allocate it dynamically there.  Define it global and
 * allocate from cpuset_init().
 */
static cpumask_var_t cpus_attach;

static void cpuset_attach(struct cgroup_subsys_state *css,
			  struct cgroup_taskset *tset)
{
	/* static buf protected by cpuset_mutex */
	static nodemask_t cpuset_attach_nodemask_to;
	struct mm_struct *mm;
	struct task_struct *task;
	struct task_struct *leader = cgroup_taskset_first(tset);
	struct cpuset *cs = css_cs(css);
	struct cpuset *oldcs = cpuset_attach_old_cs;

	mutex_lock(&cpuset_mutex);

	/* prepare for attach */
	if (cs == &top_cpuset)
		cpumask_copy(cpus_attach, cpu_possible_mask);
	else
		guarantee_online_cpus(cs, cpus_attach);

	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);

	cgroup_taskset_for_each(task, tset) {
		/*
		 * can_attach beforehand should guarantee that we don't
		 * fail.  TODO: have a better way to handle failure here
		 */
		WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));

		cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
		cpuset_update_task_spread_flag(cs, task);
	}

	/*
	 * Change mm, possibly for multiple threads in a threadgroup.
	 * This is expensive and may sleep.
	 */
	cpuset_attach_nodemask_to = cs->effective_mems;
	mm = get_task_mm(leader);
	if (mm) {
		mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);

		/*
		 * old_mems_allowed is the same with mems_allowed here, except
		 * if this task is being moved automatically due to hotplug.
		 * In that case @mems_allowed has been updated and is empty,
		 * so @old_mems_allowed is the right nodesets that we migrate
		 * mm from.
		 */
		if (is_memory_migrate(cs)) {
			cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
					  &cpuset_attach_nodemask_to);
		}
		mmput(mm);
	}

	cs->old_mems_allowed = cpuset_attach_nodemask_to;

	cs->attach_in_progress--;
	if (!cs->attach_in_progress)
		wake_up(&cpuset_attach_wq);

	mutex_unlock(&cpuset_mutex);
}

/* The various types of files and directories in a cpuset file system */

typedef enum {
	FILE_MEMORY_MIGRATE,
	FILE_CPULIST,
	FILE_MEMLIST,
	FILE_EFFECTIVE_CPULIST,
	FILE_EFFECTIVE_MEMLIST,
	FILE_CPU_EXCLUSIVE,
	FILE_MEM_EXCLUSIVE,
	FILE_MEM_HARDWALL,
	FILE_SCHED_LOAD_BALANCE,
	FILE_SCHED_RELAX_DOMAIN_LEVEL,
	FILE_MEMORY_PRESSURE_ENABLED,
	FILE_MEMORY_PRESSURE,
	FILE_SPREAD_PAGE,
	FILE_SPREAD_SLAB,
} cpuset_filetype_t;

static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
			    u64 val)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;
	int retval = 0;

	mutex_lock(&cpuset_mutex);
	if (!is_cpuset_online(cs)) {
		retval = -ENODEV;
		goto out_unlock;
	}

	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_EXCLUSIVE:
		retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_HARDWALL:
		retval = update_flag(CS_MEM_HARDWALL, cs, val);
		break;
	case FILE_SCHED_LOAD_BALANCE:
		retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
		break;
	case FILE_MEMORY_MIGRATE:
		retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
		break;
	case FILE_MEMORY_PRESSURE_ENABLED:
		cpuset_memory_pressure_enabled = !!val;
		break;
	case FILE_MEMORY_PRESSURE:
		retval = -EACCES;
		break;
	case FILE_SPREAD_PAGE:
		retval = update_flag(CS_SPREAD_PAGE, cs, val);
		break;
	case FILE_SPREAD_SLAB:
		retval = update_flag(CS_SPREAD_SLAB, cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
out_unlock:
	mutex_unlock(&cpuset_mutex);
	return retval;
}

static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
			    s64 val)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;
	int retval = -ENODEV;

	mutex_lock(&cpuset_mutex);
	if (!is_cpuset_online(cs))
		goto out_unlock;

	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
		retval = update_relax_domain_level(cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
out_unlock:
	mutex_unlock(&cpuset_mutex);
	return retval;
}

/*
 * Common handling for a write to a "cpus" or "mems" file.
 */
static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
				    char *buf, size_t nbytes, loff_t off)
{
	struct cpuset *cs = css_cs(of_css(of));
	struct cpuset *trialcs;
	int retval = -ENODEV;

	buf = strstrip(buf);

	/*
	 * CPU or memory hotunplug may leave @cs w/o any execution
	 * resources, in which case the hotplug code asynchronously updates
	 * configuration and transfers all tasks to the nearest ancestor
	 * which can execute.
	 *
	 * As writes to "cpus" or "mems" may restore @cs's execution
	 * resources, wait for the previously scheduled operations before
	 * proceeding, so that we don't end up keep removing tasks added
	 * after execution capability is restored.
	 *
	 * cpuset_hotplug_work calls back into cgroup core via
	 * cgroup_transfer_tasks() and waiting for it from a cgroupfs
	 * operation like this one can lead to a deadlock through kernfs
	 * active_ref protection.  Let's break the protection.  Losing the
	 * protection is okay as we check whether @cs is online after
	 * grabbing cpuset_mutex anyway.  This only happens on the legacy
	 * hierarchies.
	 */
	css_get(&cs->css);
	kernfs_break_active_protection(of->kn);
	flush_work(&cpuset_hotplug_work);

	mutex_lock(&cpuset_mutex);
	if (!is_cpuset_online(cs))
		goto out_unlock;

	trialcs = alloc_trial_cpuset(cs);
	if (!trialcs) {
		retval = -ENOMEM;
		goto out_unlock;
	}

	switch (of_cft(of)->private) {
	case FILE_CPULIST:
		retval = update_cpumask(cs, trialcs, buf);
		break;
	case FILE_MEMLIST:
		retval = update_nodemask(cs, trialcs, buf);
		break;
	default:
		retval = -EINVAL;
		break;
	}

	free_trial_cpuset(trialcs);
out_unlock:
	mutex_unlock(&cpuset_mutex);
	kernfs_unbreak_active_protection(of->kn);
	css_put(&cs->css);
	return retval ?: nbytes;
}

/*
 * These ascii lists should be read in a single call, by using a user
 * buffer large enough to hold the entire map.  If read in smaller
 * chunks, there is no guarantee of atomicity.  Since the display format
 * used, list of ranges of sequential numbers, is variable length,
 * and since these maps can change value dynamically, one could read
 * gibberish by doing partial reads while a list was changing.
 */
static int cpuset_common_seq_show(struct seq_file *sf, void *v)
{
	struct cpuset *cs = css_cs(seq_css(sf));
	cpuset_filetype_t type = seq_cft(sf)->private;
	int ret = 0;

	spin_lock_irq(&callback_lock);

	switch (type) {
	case FILE_CPULIST:
		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
		break;
	case FILE_MEMLIST:
		seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
		break;
	case FILE_EFFECTIVE_CPULIST:
		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
		break;
	case FILE_EFFECTIVE_MEMLIST:
		seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
		break;
	default:
		ret = -EINVAL;
	}

	spin_unlock_irq(&callback_lock);
	return ret;
}

static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;
	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		return is_cpu_exclusive(cs);
	case FILE_MEM_EXCLUSIVE:
		return is_mem_exclusive(cs);
	case FILE_MEM_HARDWALL:
		return is_mem_hardwall(cs);
	case FILE_SCHED_LOAD_BALANCE:
		return is_sched_load_balance(cs);
	case FILE_MEMORY_MIGRATE:
		return is_memory_migrate(cs);
	case FILE_MEMORY_PRESSURE_ENABLED:
		return cpuset_memory_pressure_enabled;
	case FILE_MEMORY_PRESSURE:
		return fmeter_getrate(&cs->fmeter);
	case FILE_SPREAD_PAGE:
		return is_spread_page(cs);
	case FILE_SPREAD_SLAB:
		return is_spread_slab(cs);
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}

static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;
	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
		return cs->relax_domain_level;
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}

/*
 * for the common functions, 'private' gives the type of file
 */

static struct cftype files[] = {
	{
		.name = "cpus",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * NR_CPUS),
		.private = FILE_CPULIST,
	},

	{
		.name = "mems",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * MAX_NUMNODES),
		.private = FILE_MEMLIST,
	},

	{
		.name = "effective_cpus",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_CPULIST,
	},

	{
		.name = "effective_mems",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_MEMLIST,
	},

	{
		.name = "cpu_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_CPU_EXCLUSIVE,
	},

	{
		.name = "mem_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_EXCLUSIVE,
	},

	{
		.name = "mem_hardwall",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_HARDWALL,
	},

	{
		.name = "sched_load_balance",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SCHED_LOAD_BALANCE,
	},

	{
		.name = "sched_relax_domain_level",
		.read_s64 = cpuset_read_s64,
		.write_s64 = cpuset_write_s64,
		.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
	},

	{
		.name = "memory_migrate",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_MIGRATE,
	},

	{
		.name = "memory_pressure",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_PRESSURE,
		.mode = S_IRUGO,
	},

	{
		.name = "memory_spread_page",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_PAGE,
	},

	{
		.name = "memory_spread_slab",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_SLAB,
	},

	{
		.name = "memory_pressure_enabled",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_PRESSURE_ENABLED,
	},

	{ }	/* terminate */
};

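/*
 * Example interaction through cgroupfs (illustrative paths and values):
 *
 *	# echo 2-3 > /sys/fs/cgroup/cpuset/mygroup/cpuset.cpus
 *	# echo 0 > /sys/fs/cgroup/cpuset/mygroup/cpuset.mems
 *	# cat /sys/fs/cgroup/cpuset/mygroup/cpuset.effective_cpus
 *
 * Writes to the "cpus"/"mems" files above land in
 * cpuset_write_resmask(); the boolean files go through
 * cpuset_write_u64().
 */
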
/*
 *	cpuset_css_alloc - allocate a cpuset css
 *	cgrp:	control group that the new cpuset will be part of
 */
static struct cgroup_subsys_state *
cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct cpuset *cs;

	if (!parent_css)
		return &top_cpuset.css;

	cs = kzalloc(sizeof(*cs), GFP_KERNEL);
	if (!cs)
		return ERR_PTR(-ENOMEM);
	if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
		goto free_cs;
	if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
		goto free_cpus;

	set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
	cpumask_clear(cs->cpus_allowed);
	nodes_clear(cs->mems_allowed);
	cpumask_clear(cs->effective_cpus);
	nodes_clear(cs->effective_mems);
	fmeter_init(&cs->fmeter);
	cs->relax_domain_level = -1;

	return &cs->css;

free_cpus:
	free_cpumask_var(cs->cpus_allowed);
free_cs:
	kfree(cs);
	return ERR_PTR(-ENOMEM);
}

static int cpuset_css_online(struct cgroup_subsys_state *css)
{
	struct cpuset *cs = css_cs(css);
	struct cpuset *parent = parent_cs(cs);
	struct cpuset *tmp_cs;
	struct cgroup_subsys_state *pos_css;

	if (!parent)
		return 0;

	mutex_lock(&cpuset_mutex);

	set_bit(CS_ONLINE, &cs->flags);
	if (is_spread_page(parent))
		set_bit(CS_SPREAD_PAGE, &cs->flags);
	if (is_spread_slab(parent))
		set_bit(CS_SPREAD_SLAB, &cs->flags);

	cpuset_inc();

	spin_lock_irq(&callback_lock);
	if (cgroup_on_dfl(cs->css.cgroup)) {
		cpumask_copy(cs->effective_cpus, parent->effective_cpus);
		cs->effective_mems = parent->effective_mems;
	}
	spin_unlock_irq(&callback_lock);

	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
		goto out_unlock;

	/*
	 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
	 * set.  This flag handling is implemented in cgroup core for
	 * historical reasons - the flag may be specified during mount.
	 *
	 * Currently, if any sibling cpusets have exclusive cpus or mem, we
	 * refuse to clone the configuration - thereby refusing the task to
	 * be entered, and as a result refusing the sys_unshare() or
	 * clone() which initiated it.  If this becomes a problem for some
	 * users who wish to allow that scenario, then this could be
	 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
	 * (and likewise for mems) to the new cgroup.
	 */
	rcu_read_lock();
	cpuset_for_each_child(tmp_cs, pos_css, parent) {
		if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
			rcu_read_unlock();
			goto out_unlock;
		}
	}
	rcu_read_unlock();

	spin_lock_irq(&callback_lock);
	cs->mems_allowed = parent->mems_allowed;
	cs->effective_mems = parent->mems_allowed;
	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
	cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
	spin_unlock_irq(&callback_lock);
out_unlock:
	mutex_unlock(&cpuset_mutex);
	return 0;
}

/*
 * If the cpuset being removed has its flag 'sched_load_balance'
 * enabled, then simulate turning sched_load_balance off, which
 * will call rebuild_sched_domains_locked().
 */
static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
	struct cpuset *cs = css_cs(css);

	mutex_lock(&cpuset_mutex);

	if (is_sched_load_balance(cs))
		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);

	cpuset_dec();
	clear_bit(CS_ONLINE, &cs->flags);

	mutex_unlock(&cpuset_mutex);
}

static void cpuset_css_free(struct cgroup_subsys_state *css)
{
	struct cpuset *cs = css_cs(css);

	free_cpumask_var(cs->effective_cpus);
	free_cpumask_var(cs->cpus_allowed);
	kfree(cs);
}

static void cpuset_bind(struct cgroup_subsys_state *root_css)
{
	mutex_lock(&cpuset_mutex);
	spin_lock_irq(&callback_lock);

	if (cgroup_on_dfl(root_css->cgroup)) {
		cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
		top_cpuset.mems_allowed = node_possible_map;
	} else {
		cpumask_copy(top_cpuset.cpus_allowed,
			     top_cpuset.effective_cpus);
		top_cpuset.mems_allowed = top_cpuset.effective_mems;
	}

	spin_unlock_irq(&callback_lock);
	mutex_unlock(&cpuset_mutex);
}

struct cgroup_subsys cpuset_cgrp_subsys = {
	.css_alloc	= cpuset_css_alloc,
	.css_online	= cpuset_css_online,
	.css_offline	= cpuset_css_offline,
	.css_free	= cpuset_css_free,
	.can_attach	= cpuset_can_attach,
	.cancel_attach	= cpuset_cancel_attach,
	.attach		= cpuset_attach,
	.bind		= cpuset_bind,
	.legacy_cftypes	= files,
	.early_init	= 1,
};

/**
 * cpuset_init - initialize cpusets at system boot
 *
 * Description: Initialize top_cpuset and the cpuset internal file system
 **/
int __init cpuset_init(void)
{
	int err = 0;

	if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
		BUG();
	if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
		BUG();

	cpumask_setall(top_cpuset.cpus_allowed);
	nodes_setall(top_cpuset.mems_allowed);
	cpumask_setall(top_cpuset.effective_cpus);
	nodes_setall(top_cpuset.effective_mems);

	fmeter_init(&top_cpuset.fmeter);
	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
	top_cpuset.relax_domain_level = -1;

	err = register_filesystem(&cpuset_fs_type);
	if (err < 0)
		return err;

	if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
		BUG();

	return 0;
}

/*
 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
 * or memory nodes, we need to walk over the cpuset hierarchy,
 * removing that CPU or node from all cpusets.  If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
 * cpuset to its next-highest non-empty parent.
 */
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
	struct cpuset *parent;

	/*
	 * Find its next-highest non-empty parent, (top cpuset
	 * has online cpus, so can't be empty).
	 */
	parent = parent_cs(cs);
	while (cpumask_empty(parent->cpus_allowed) ||
			nodes_empty(parent->mems_allowed))
		parent = parent_cs(parent);

	if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
		pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
		pr_cont_cgroup_name(cs->css.cgroup);
		pr_cont("\n");
	}
}

static void
hotplug_update_tasks_legacy(struct cpuset *cs,
			    struct cpumask *new_cpus, nodemask_t *new_mems,
			    bool cpus_updated, bool mems_updated)
{
	bool is_empty;

	spin_lock_irq(&callback_lock);
	cpumask_copy(cs->cpus_allowed, new_cpus);
	cpumask_copy(cs->effective_cpus, new_cpus);
	cs->mems_allowed = *new_mems;
	cs->effective_mems = *new_mems;
	spin_unlock_irq(&callback_lock);

	/*
	 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
	 * as the tasks will be migrated to an ancestor.
	 */
	if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
		update_tasks_cpumask(cs);
	if (mems_updated && !nodes_empty(cs->mems_allowed))
		update_tasks_nodemask(cs);

	is_empty = cpumask_empty(cs->cpus_allowed) ||
		   nodes_empty(cs->mems_allowed);

	mutex_unlock(&cpuset_mutex);

	/*
	 * Move tasks to the nearest ancestor with execution resources,
	 * This is full cgroup operation which will also call back into
	 * cpuset.  Should be done outside any lock.
	 */
	if (is_empty)
		remove_tasks_in_empty_cpuset(cs);

	mutex_lock(&cpuset_mutex);
}

static void
hotplug_update_tasks(struct cpuset *cs,
		     struct cpumask *new_cpus, nodemask_t *new_mems,
		     bool cpus_updated, bool mems_updated)
{
	if (cpumask_empty(new_cpus))
		cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
	if (nodes_empty(*new_mems))
		*new_mems = parent_cs(cs)->effective_mems;

	spin_lock_irq(&callback_lock);
	cpumask_copy(cs->effective_cpus, new_cpus);
	cs->effective_mems = *new_mems;
	spin_unlock_irq(&callback_lock);

	if (cpus_updated)
		update_tasks_cpumask(cs);
	if (mems_updated)
		update_tasks_nodemask(cs);
}

/**
 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
 * @cs: cpuset in interest
 *
 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
 * offline, update @cs accordingly.  If @cs ends up with no CPU or memory,
 * all its tasks are moved to the nearest ancestor with both resources.
 */
static void cpuset_hotplug_update_tasks(struct cpuset *cs)
{
	static cpumask_t new_cpus;
	static nodemask_t new_mems;
	bool cpus_updated;
	bool mems_updated;
retry:
	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);

	mutex_lock(&cpuset_mutex);

	/*
	 * We have raced with task attaching. We wait until attaching
	 * is finished, so we won't attach a task to an empty cpuset.
	 */
	if (cs->attach_in_progress) {
		mutex_unlock(&cpuset_mutex);
		goto retry;
	}

	cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
	nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);

	cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
	mems_updated = !nodes_equal(new_mems, cs->effective_mems);

	if (cgroup_on_dfl(cs->css.cgroup))
		hotplug_update_tasks(cs, &new_cpus, &new_mems,
				     cpus_updated, mems_updated);
	else
		hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
					    cpus_updated, mems_updated);

	mutex_unlock(&cpuset_mutex);
}

/**
 * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
 *
 * This function is called after either CPU or memory configuration has
 * changed and updates cpuset accordingly.  The top_cpuset is always
 * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
 * order to make cpusets transparent (of no affect) on systems that are
 * actively using CPU hotplug but making no active use of cpusets.
 *
 * Non-root cpusets are only affected by offlining.  If any CPUs or memory
 * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
 * all descendants.
 *
 * Note that CPU offlining during suspend is ignored.  We don't modify
 * cpusets across suspend/resume cycles at all.
 */
static void cpuset_hotplug_workfn(struct work_struct *work)
{
	static cpumask_t new_cpus;
	static nodemask_t new_mems;
	bool cpus_updated, mems_updated;
	bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);

	mutex_lock(&cpuset_mutex);

	/* fetch the available cpus/mems and find out which changed how */
	cpumask_copy(&new_cpus, cpu_active_mask);
	new_mems = node_states[N_MEMORY];

	cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
	mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);

	/* synchronize cpus_allowed to cpu_active_mask */
	if (cpus_updated) {
		spin_lock_irq(&callback_lock);
		if (!on_dfl)
			cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
		cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
		spin_unlock_irq(&callback_lock);
		/* we don't mess with cpumasks of tasks in top_cpuset */
	}

	/* synchronize mems_allowed to N_MEMORY */
	if (mems_updated) {
		spin_lock_irq(&callback_lock);
		if (!on_dfl)
			top_cpuset.mems_allowed = new_mems;
		top_cpuset.effective_mems = new_mems;
		spin_unlock_irq(&callback_lock);
		update_tasks_nodemask(&top_cpuset);
	}

	mutex_unlock(&cpuset_mutex);

	/* if cpus or mems changed, we need to propagate to descendants */
	if (cpus_updated || mems_updated) {
		struct cpuset *cs;
		struct cgroup_subsys_state *pos_css;

		rcu_read_lock();
		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
			if (cs == &top_cpuset || !css_tryget_online(&cs->css))
				continue;
			rcu_read_unlock();

			cpuset_hotplug_update_tasks(cs);

			rcu_read_lock();
			css_put(&cs->css);
		}
		rcu_read_unlock();
	}

	/* rebuild sched domains if cpus_allowed has changed */
	if (cpus_updated)
		rebuild_sched_domains();
}

void cpuset_update_active_cpus(bool cpu_online)
{
	/*
	 * We're inside cpu hotplug critical region which usually nests
	 * inside cgroup synchronization.  Bounce actual hotplug processing
	 * to a work item to avoid reverse locking order.
	 *
	 * We still need to do partition_sched_domains() synchronously;
	 * otherwise, the scheduler will get confused and put tasks to the
	 * dead CPU.  Fall back to the default single domain.
	 * cpuset_hotplug_workfn() will rebuild it as necessary.
	 */
	partition_sched_domains(1, NULL, NULL);
	schedule_work(&cpuset_hotplug_work);
}

/*
 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
 * Call this routine anytime after node_states[N_MEMORY] changes.
 * See cpuset_update_active_cpus() for CPU hotplug handling.
 */
static int cpuset_track_online_nodes(struct notifier_block *self,
				unsigned long action, void *arg)
{
	schedule_work(&cpuset_hotplug_work);
	return NOTIFY_OK;
}

static struct notifier_block cpuset_track_online_nodes_nb = {
	.notifier_call = cpuset_track_online_nodes,
	.priority = 10,		/* ??! */
};

/**
 * cpuset_init_smp - initialize cpus_allowed
 *
 * Description: Finish top cpuset after cpu, node maps are initialized
 */
void __init cpuset_init_smp(void)
{
	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
	top_cpuset.mems_allowed = node_states[N_MEMORY];
	top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;

	cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
	top_cpuset.effective_mems = node_states[N_MEMORY];

	register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
}

/**
 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
 *
 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of cpu_online_mask, even if this means going outside the
 * tasks cpuset.
 **/
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
	unsigned long flags;

	spin_lock_irqsave(&callback_lock, flags);
	rcu_read_lock();
	guarantee_online_cpus(task_cs(tsk), pmask);
	rcu_read_unlock();
	spin_unlock_irqrestore(&callback_lock, flags);
}

void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
	rcu_read_lock();
	do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
	rcu_read_unlock();

	/*
	 * We own tsk->cpus_allowed, nobody can change it under us.
	 *
	 * But we used cs && cs->cpus_allowed lockless and thus can
	 * race with cgroup_attach_task() or update_cpumask() and get
	 * the wrong tsk->cpus_allowed.  However, both cases imply the
	 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
	 * which takes task_rq_lock().
	 *
	 * If we are called after it dropped the lock we must see all
	 * changes in tsk_cs()->cpus_allowed.  Otherwise we can temporary
	 * set any mask even if it is not right from task_cs() pov,
	 * the pending set_cpus_allowed_ptr() will fix things.
	 *
	 * select_fallback_rq() will fix things ups and set cpu_possible_mask
	 * if required.
	 */
}

void __init cpuset_init_current_mems_allowed(void)
{
	nodes_setall(current->mems_allowed);
}

/**
 * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
 *
 * Description: Returns the nodemask_t mems_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of node_states[N_MEMORY], even if this means going outside the
 * tasks cpuset.
 **/
nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
	nodemask_t mask;
	unsigned long flags;

	spin_lock_irqsave(&callback_lock, flags);
	rcu_read_lock();
	guarantee_online_mems(task_cs(tsk), &mask);
	rcu_read_unlock();
	spin_unlock_irqrestore(&callback_lock, flags);

	return mask;
}

/**
 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
 * @nodemask: the nodemask to be checked
 *
 * Are any of the nodes in the nodemask allowed in current->mems_allowed?
 */
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
	return nodes_intersects(*nodemask, current->mems_allowed);
}

/*
 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
 * mem_hardwall ancestor to the specified cpuset.  Call holding
 * callback_lock.  If no ancestor is mem_exclusive or mem_hardwall
 * (an unusual configuration), then returns the root cpuset.
 */
static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
{
	while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
		cs = parent_cs(cs);
	return cs;
}

/**
 * cpuset_node_allowed - Can we allocate on a memory node?
 * @node: is this an allowed node?
 * @gfp_mask: memory allocation flags
 *
 * If we're in interrupt, yes, we can always allocate.  If @node is set in
 * current's mems_allowed, yes.  If it's not a __GFP_HARDWALL request and
 * this node is set in the nearest hardwalled cpuset ancestor to current's
 * cpuset, yes.  If current has access to memory reserves due to TIF_MEMDIE,
 * yes.  Otherwise, no.
 *
 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
 * and do not allow allocations outside the current tasks cpuset
 * unless the task has been OOM killed as is marked TIF_MEMDIE.
 * GFP_KERNEL allocations are not so marked, so can escape to the
 * nearest enclosing hardwalled ancestor cpuset.
 *
 * Scanning up parent cpusets requires callback_lock.  The
 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
 * current tasks mems_allowed came up empty on the first pass over
 * the zonelist.  So only GFP_KERNEL allocations, if all nodes in the
 * cpuset are short of memory, might require taking the callback_lock.
 *
 * The first call here from mm/page_alloc:get_page_from_freelist()
 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
 * so no allocation on a node outside the cpuset is allowed (unless
 * in interrupt, of course).
 *
 * Rules:
 *    in_interrupt - any node ok (current task context irrelevant)
 *    GFP_ATOMIC   - any node ok
 *    TIF_MEMDIE   - any node ok
 *    GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
 *    GFP_USER     - only nodes in current tasks mems allowed ok.
 */
int __cpuset_node_allowed(int node, gfp_t gfp_mask)
{
	struct cpuset *cs;	/* current cpuset ancestors */
	int allowed;		/* is allocation in zone z allowed? */
	unsigned long flags;

	if (in_interrupt())
		return 1;
	if (node_isset(node, current->mems_allowed))
		return 1;
	/*
	 * Allow tasks that have access to memory reserves because they have
	 * been OOM killed to get memory anywhere.
	 */
	if (unlikely(test_thread_flag(TIF_MEMDIE)))
		return 1;
	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
		return 0;

	if (current->flags & PF_EXITING) /* Let dying task have memory */
		return 1;

	/* Not hardwall and node outside mems_allowed: scan up cpusets */
	spin_lock_irqsave(&callback_lock, flags);

	rcu_read_lock();
	cs = nearest_hardwall_ancestor(task_cs(current));
	allowed = node_isset(node, cs->mems_allowed);
	rcu_read_unlock();

	spin_unlock_irqrestore(&callback_lock, flags);
	return allowed;
}

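/*
 * For example (hypothetical layout): with a mem_hardwall cpuset H on
 * nodes 0-1 and a child C on node 0, a GFP_USER (__GFP_HARDWALL)
 * allocation by a task in C is confined to node 0, while a GFP_KERNEL
 * allocation may fall back to node 1, the nearest hardwalled
 * ancestor's nodes, but never beyond H.
 */
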
/**
 * cpuset_mem_spread_node() - On which node to begin search for a file page
 * cpuset_slab_spread_node() - On which node to begin search for a slab page
 *
 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
 * tasks in a cpuset with is_spread_page or is_spread_slab set),
 * and if the memory allocation used cpuset_mem_spread_node()
 * to determine on which node to start looking, as it will for
 * certain page cache or slab cache pages such as used for file
 * system buffers and inode caches, then instead of starting on the
 * local node to look for a free page, rather spread the starting
 * node around the tasks mems_allowed nodes.
 *
 * We don't have to worry about the returned node being offline
 * because the routines calling guarantee_online_mems() are careful
 * to only set nodes in task->mems_allowed that are online.  And even
 * if one slipped through, the next pass of the rotor skips over it.
 */
static int cpuset_spread_node(int *rotor)
{
	int node;

	node = next_node(*rotor, current->mems_allowed);
	if (node == MAX_NUMNODES)
		node = first_node(current->mems_allowed);
	*rotor = node;
	return node;
}

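/*
 * Example of the rotor walk: with mems_allowed = 0,2-3 and *rotor
 * previously 2, successive calls return 3, then wrap to 0, then 2,
 * spreading allocations round-robin over the allowed nodes.
 */
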
int cpuset_mem_spread_node(void)
{
	if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
		current->cpuset_mem_spread_rotor =
			node_random(&current->mems_allowed);

	return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
}

int cpuset_slab_spread_node(void)
{
	if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
		current->cpuset_slab_spread_rotor =
			node_random(&current->mems_allowed);

	return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
}

EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);

/**
 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
 * @tsk1: pointer to task_struct of some task.
 * @tsk2: pointer to task_struct of some other task.
 *
 * Description: Return true if @tsk1's mems_allowed intersects the
 * mems_allowed of @tsk2.  Used by the OOM killer to determine if
 * one of the task's memory usage might impact the memory available
 * to the other.
 **/
int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
				   const struct task_struct *tsk2)
{
	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}

/**
 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
 * @tsk: pointer to task_struct of some task.
 *
 * Description: Prints @tsk's name, cpuset name, and cached copy of its
 * mems_allowed to the kernel log.
 */
void cpuset_print_task_mems_allowed(struct task_struct *tsk)
{
	struct cgroup *cgrp;

	rcu_read_lock();

	cgrp = task_cs(tsk)->css.cgroup;
	pr_info("%s cpuset=", tsk->comm);
	pr_cont_cgroup_name(cgrp);
	pr_cont(" mems_allowed=%*pbl\n", nodemask_pr_args(&tsk->mems_allowed));

	rcu_read_unlock();
}

/*
 * Collection of memory_pressure is suppressed unless
 * this flag is enabled by writing "1" to the special
 * cpuset file 'memory_pressure_enabled' in the root cpuset.
 */

int cpuset_memory_pressure_enabled __read_mostly;

/**
 * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
 *
 * Keep a running average of the rate of synchronous (direct)
 * page reclaim efforts initiated by tasks in each cpuset.
 *
 * This represents the rate at which some task in the cpuset
 * ran low on memory on all nodes it was allowed to use, and
 * had to enter the kernels page reclaim code in an effort to
 * create more free memory by tossing clean pages or swapping
 * or writing dirty pages.
 *
 * Display to user space in the per-cpuset read-only file
 * "memory_pressure".  Value displayed is an integer
 * representing the recent rate of entry into the synchronous
 * (direct) page reclaim by any task attached to the cpuset.
 **/

void __cpuset_memory_pressure_bump(void)
{
	rcu_read_lock();
	fmeter_markevent(&task_cs(current)->fmeter);
	rcu_read_unlock();
}

#ifdef CONFIG_PROC_PID_CPUSET
/*
 * proc_cpuset_show()
 *  - Print tasks cpuset path into seq_file.
 *  - Used for /proc/<pid>/cpuset.
 */
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
		     struct pid *pid, struct task_struct *tsk)
{
	char *buf, *p;
	struct cgroup_subsys_state *css;
	int retval;

	retval = -ENOMEM;
	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		goto out;

	retval = -ENAMETOOLONG;
	rcu_read_lock();
	css = task_css(tsk, cpuset_cgrp_id);
	p = cgroup_path(css->cgroup, buf, PATH_MAX);
	rcu_read_unlock();
	if (!p)
		goto out_free;
	seq_puts(m, p);
	seq_putc(m, '\n');
	retval = 0;
out_free:
	kfree(buf);
out:
	return retval;
}
#endif /* CONFIG_PROC_PID_CPUSET */

/* Display task mems_allowed in /proc/<pid>/status file. */
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{
	seq_printf(m, "Mems_allowed:\t%*pb\n",
		   nodemask_pr_args(&task->mems_allowed));
	seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
		   nodemask_pr_args(&task->mems_allowed));
}