/*
 *  kernel/cpuset.c
 *
 *  Processor and memory placement constraints for sets of tasks
 *  (cpusets), implemented on top of the cgroup subsystem.
 */
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/export.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/time.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>

#include <asm/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>
#include <linux/wait.h>

struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;

/* See "Frequency meter" comments, below. */
struct fmeter {
	int cnt;		/* unprocessed events count */
	int val;		/* most recent output value */
	time_t time;		/* clock (secs) when val computed */
	spinlock_t lock;	/* guards read or write of above */
};

struct cpuset {
	struct cgroup_subsys_state css;

	unsigned long flags;		/* "unsigned long" so bitops work */

	/*
	 * On default hierarchy:
	 *
	 * The user-configured masks can only be changed by writing to
	 * cpuset.cpus and cpuset.mems, and won't be limited by the
	 * parent masks.
	 *
	 * The effective masks are the real masks that apply to the tasks
	 * in the cpuset.  They may be changed if the configured masks are
	 * changed or hotplug happens.
	 *
	 * effective_mask == configured_mask & parent's effective_mask,
	 * and if it ends up empty, it will inherit the parent's mask.
	 *
	 * On legacy hierarchy:
	 *
	 * The user-configured masks are always the same as the effective
	 * masks.
	 */

	/* user-configured CPUs and Memory Nodes allowed to tasks */
	cpumask_var_t cpus_allowed;
	nodemask_t mems_allowed;

	/* effective CPUs and Memory Nodes allowed to tasks */
	cpumask_var_t effective_cpus;
	nodemask_t effective_mems;

	/*
	 * This is old Memory Nodes tasks took on.
	 *
	 * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
	 * - A new cpuset's old_mems_allowed is initialized when some
	 *   task is moved into it.
	 * - old_mems_allowed is used in cpuset_migrate_mm() when we change
	 *   cpuset.mems_allowed and have tasks' nodemask updated, and
	 *   then old_mems_allowed is updated to mems_allowed.
	 */
	nodemask_t old_mems_allowed;

	struct fmeter fmeter;		/* memory_pressure filter */

	/*
	 * Tasks are being attached to this cpuset.  Used to prevent
	 * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
	 */
	int attach_in_progress;

	/* partition number for rebuild_sched_domains() */
	int pn;

	/* for custom sched domain */
	int relax_domain_level;
};

135static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
136{
137 return css ? container_of(css, struct cpuset, css) : NULL;
138}
139
140
141static inline struct cpuset *task_cs(struct task_struct *task)
142{
143 return css_cs(task_css(task, cpuset_cgrp_id));
144}
145
146static inline struct cpuset *parent_cs(struct cpuset *cs)
147{
148 return css_cs(cs->css.parent);
149}
150
151#ifdef CONFIG_NUMA
152static inline bool task_has_mempolicy(struct task_struct *task)
153{
154 return task->mempolicy;
155}
156#else
157static inline bool task_has_mempolicy(struct task_struct *task)
158{
159 return false;
160}
161#endif
162
163
164
165typedef enum {
166 CS_ONLINE,
167 CS_CPU_EXCLUSIVE,
168 CS_MEM_EXCLUSIVE,
169 CS_MEM_HARDWALL,
170 CS_MEMORY_MIGRATE,
171 CS_SCHED_LOAD_BALANCE,
172 CS_SPREAD_PAGE,
173 CS_SPREAD_SLAB,
174} cpuset_flagbits_t;
175
176
177static inline bool is_cpuset_online(const struct cpuset *cs)
178{
179 return test_bit(CS_ONLINE, &cs->flags);
180}
181
182static inline int is_cpu_exclusive(const struct cpuset *cs)
183{
184 return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
185}
186
187static inline int is_mem_exclusive(const struct cpuset *cs)
188{
189 return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
190}
191
192static inline int is_mem_hardwall(const struct cpuset *cs)
193{
194 return test_bit(CS_MEM_HARDWALL, &cs->flags);
195}
196
197static inline int is_sched_load_balance(const struct cpuset *cs)
198{
199 return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
200}
201
202static inline int is_memory_migrate(const struct cpuset *cs)
203{
204 return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
205}
206
207static inline int is_spread_page(const struct cpuset *cs)
208{
209 return test_bit(CS_SPREAD_PAGE, &cs->flags);
210}
211
212static inline int is_spread_slab(const struct cpuset *cs)
213{
214 return test_bit(CS_SPREAD_SLAB, &cs->flags);
215}
216
217static struct cpuset top_cpuset = {
218 .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
219 (1 << CS_MEM_EXCLUSIVE)),
220};
221
/**
 * cpuset_for_each_child - traverse online children of a cpuset
 * @child_cs: loop cursor pointing to the current child
 * @pos_css: used for iteration
 * @parent_cs: target cpuset to walk children of
 *
 * Walk @child_cs through the online children of @parent_cs.  Must be used
 * with RCU read locked.
 */
#define cpuset_for_each_child(child_cs, pos_css, parent_cs)		\
	css_for_each_child((pos_css), &(parent_cs)->css)		\
		if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))

/**
 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
 * @des_cs: loop cursor pointing to the current cpuset
 * @pos_css: used for iteration
 * @root_cs: target cpuset to walk descendants of
 *
 * Walk @des_cs through the online descendants of @root_cs.  Must be used
 * with RCU read locked.  The caller may modify @pos_css by calling
 * css_rightmost_descendant() to skip a subtree.
 */
#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)	\
	css_for_each_descendant_pre((pos_css), &(root_cs)->css)	\
		if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))

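/*
 * Illustrative use of cpuset_for_each_descendant_pre(), mirroring how
 * update_cpumasks_hier() below actually calls it (names taken from that
 * function, nothing new is assumed):
 *
 *	rcu_read_lock();
 *	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
 *		... examine or update cp ...
 *		if (nothing changed for this subtree)
 *			pos_css = css_rightmost_descendant(pos_css);
 *	}
 *	rcu_read_unlock();
 */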
/*
 * There are two global locks guarding cpuset structures - cpuset_mutex and
 * callback_lock.  We also require taking task_lock() when dereferencing a
 * task's cpuset pointer.
 *
 * A task must hold both locks to modify cpusets.  If a task holds
 * cpuset_mutex, it blocks others wanting that mutex, ensuring that it
 * is the only task able to also acquire callback_lock and be able to
 * modify cpusets.  It can perform various checks on the cpuset structure
 * first, knowing nothing will change, and can also allocate memory while
 * just holding cpuset_mutex.  While it is performing these checks, various
 * callback routines can briefly acquire callback_lock to query cpusets.
 * Once it is ready to make the changes, it takes callback_lock, blocking
 * everyone else.
 *
 * Calls to the kernel memory allocator can not be made while holding
 * callback_lock, as that would risk double tripping on callback_lock
 * from one of the callbacks into the cpuset code from within
 * __alloc_pages().
 *
 * If a task is only holding callback_lock, then it has read-only
 * access to cpusets.
 *
 * Now, the task_struct fields mems_allowed and mempolicy may be changed
 * by other task, we use alloc_lock in the task_struct fields to protect
 * them.
 *
 * The cpuset file handlers only hold callback_lock across small pieces
 * of code, such as when reading out possibly multi-word cpumasks and
 * nodemasks.
 */

static DEFINE_MUTEX(cpuset_mutex);
static DEFINE_SPINLOCK(callback_lock);

/*
 * CPU / memory hotplug is handled asynchronously.
 */
static void cpuset_hotplug_workfn(struct work_struct *work);
static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);

static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);

/*
 * This is ugly, but preserves the userspace API for existing cpuset
 * users. If someone tries to mount the "cpuset" filesystem, we
 * silently switch it to mount "cgroup" instead
 */
302static struct dentry *cpuset_mount(struct file_system_type *fs_type,
303 int flags, const char *unused_dev_name, void *data)
304{
305 struct file_system_type *cgroup_fs = get_fs_type("cgroup");
306 struct dentry *ret = ERR_PTR(-ENODEV);
307 if (cgroup_fs) {
308 char mountopts[] =
309 "cpuset,noprefix,"
310 "release_agent=/sbin/cpuset_release_agent";
311 ret = cgroup_fs->mount(cgroup_fs, flags,
312 unused_dev_name, mountopts);
313 put_filesystem(cgroup_fs);
314 }
315 return ret;
316}
317
318static struct file_system_type cpuset_fs_type = {
319 .name = "cpuset",
320 .mount = cpuset_mount,
321};
322
/*
 * Return in pmask the portion of a cpuset's cpus_allowed that
 * are online.  If none are online, walk up the cpuset hierarchy
 * until we find one that does have some online cpus.  The top
 * cpuset always has some cpus online.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of cpu_online_mask.
 *
 * Call with callback_lock or cpuset_mutex held.
 */
334static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
335{
336 while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
337 cs = parent_cs(cs);
338 cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
339}
340
/*
 * Return in *pmask the portion of a cpuset's mems_allowed that
 * are online, with memory.  If none are online with memory, walk
 * up the cpuset hierarchy until we find one that does have some
 * online mems.  The top cpuset always has some mems online.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of node_states[N_MEMORY].
 *
 * Call with callback_lock or cpuset_mutex held.
 */
352static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
353{
354 while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
355 cs = parent_cs(cs);
356 nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
357}
358
/*
 * update task's spread flag if cpuset's page/slab spread flag is set
 *
 * Call with callback_lock or cpuset_mutex held.
 */
364static void cpuset_update_task_spread_flag(struct cpuset *cs,
365 struct task_struct *tsk)
366{
367 if (is_spread_page(cs))
368 task_set_spread_page(tsk);
369 else
370 task_clear_spread_page(tsk);
371
372 if (is_spread_slab(cs))
373 task_set_spread_slab(tsk);
374 else
375 task_clear_spread_slab(tsk);
376}
377
/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set.  Call holding cpuset_mutex.
 */
386static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
387{
388 return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
389 nodes_subset(p->mems_allowed, q->mems_allowed) &&
390 is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
391 is_mem_exclusive(p) <= is_mem_exclusive(q);
392}
393
/**
 * alloc_trial_cpuset - allocate a trial cpuset
 * @cs: the cpuset that the trial cpuset duplicates
 */
398static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
399{
400 struct cpuset *trial;
401
402 trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
403 if (!trial)
404 return NULL;
405
406 if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
407 goto free_cs;
408 if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
409 goto free_cpus;
410
411 cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
412 cpumask_copy(trial->effective_cpus, cs->effective_cpus);
413 return trial;
414
415free_cpus:
416 free_cpumask_var(trial->cpus_allowed);
417free_cs:
418 kfree(trial);
419 return NULL;
420}
421
/**
 * free_trial_cpuset - free the trial cpuset
 * @trial: the trial cpuset to be freed
 */
426static void free_trial_cpuset(struct cpuset *trial)
427{
428 free_cpumask_var(trial->effective_cpus);
429 free_cpumask_var(trial->cpus_allowed);
430 kfree(trial);
431}
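/*
 * Typical life cycle of a trial cpuset, as used by update_flag() and
 * cpuset_write_resmask() below: duplicate the current cpuset, apply the
 * requested change to the copy, run validate_change() on it, and only
 * then commit the change to the real cpuset under callback_lock:
 *
 *	trialcs = alloc_trial_cpuset(cs);
 *	if (!trialcs)
 *		return -ENOMEM;
 *	... modify trialcs ...
 *	err = validate_change(cs, trialcs);
 *	... commit on success ...
 *	free_trial_cpuset(trialcs);
 */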
432
/*
 * validate_change() - Used to validate that any proposed cpuset change
 *		       follows the structural rules for cpusets.
 *
 * If we replaced the flag and mask values of the current cpuset
 * (cur) with those values in the trial cpuset (trial), would
 * our various subset and exclusive rules still be valid?  Presumes
 * cpuset_mutex held.
 *
 * 'cur' is the address of an actual, in-use cpuset.  Operations
 * such as list traversal that depend on the actual address of the
 * cpuset in the list must use cur below, not trial.
 *
 * 'trial' is the address of a bulk structure copy of cur, with
 * perhaps one or more of the fields cpus_allowed, mems_allowed,
 * or flags changed to new, trial values.
 *
 * Return 0 if valid, -errno if not.
 */
453static int validate_change(struct cpuset *cur, struct cpuset *trial)
454{
455 struct cgroup_subsys_state *css;
456 struct cpuset *c, *par;
457 int ret;
458
459 rcu_read_lock();
460
461
462 ret = -EBUSY;
463 cpuset_for_each_child(c, css, cur)
464 if (!is_cpuset_subset(c, trial))
465 goto out;
466
467
468 ret = 0;
469 if (cur == &top_cpuset)
470 goto out;
471
472 par = parent_cs(cur);
473
474
475 ret = -EACCES;
476 if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par))
477 goto out;
478
479
480
481
482
483 ret = -EINVAL;
484 cpuset_for_each_child(c, css, par) {
485 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
486 c != cur &&
487 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
488 goto out;
489 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
490 c != cur &&
491 nodes_intersects(trial->mems_allowed, c->mems_allowed))
492 goto out;
493 }
494
495
496
497
498
499 ret = -ENOSPC;
500 if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) {
501 if (!cpumask_empty(cur->cpus_allowed) &&
502 cpumask_empty(trial->cpus_allowed))
503 goto out;
504 if (!nodes_empty(cur->mems_allowed) &&
505 nodes_empty(trial->mems_allowed))
506 goto out;
507 }
508
509
510
511
512
513 ret = -EBUSY;
514 if (is_cpu_exclusive(cur) &&
515 !cpuset_cpumask_can_shrink(cur->cpus_allowed,
516 trial->cpus_allowed))
517 goto out;
518
519 ret = 0;
520out:
521 rcu_read_unlock();
522 return ret;
523}
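/*
 * For example, shrinking a cpuset's cpus_allowed to the empty set while
 * it still has attached tasks fails with -ENOSPC above, and giving a
 * cpu_exclusive cpuset CPUs that overlap a cpu_exclusive sibling fails
 * with -EINVAL.
 */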
524
525#ifdef CONFIG_SMP
/*
 * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping effective_cpus masks?
 */
530static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
531{
532 return cpumask_intersects(a->effective_cpus, b->effective_cpus);
533}
534
535static void
536update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
537{
538 if (dattr->relax_domain_level < c->relax_domain_level)
539 dattr->relax_domain_level = c->relax_domain_level;
540 return;
541}
542
543static void update_domain_attr_tree(struct sched_domain_attr *dattr,
544 struct cpuset *root_cs)
545{
546 struct cpuset *cp;
547 struct cgroup_subsys_state *pos_css;
548
549 rcu_read_lock();
550 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
551 if (cp == root_cs)
552 continue;
553
554
555 if (cpumask_empty(cp->cpus_allowed)) {
556 pos_css = css_rightmost_descendant(pos_css);
557 continue;
558 }
559
560 if (is_sched_load_balance(cp))
561 update_domain_attr(dattr, cp);
562 }
563 rcu_read_unlock();
564}
565
/*
 * generate_sched_domains()
 *
 * This function builds a partial partition of the systems CPUs.
 * A 'partial partition' is a set of non-overlapping subsets whose
 * union is a subset of that set.
 * The output of this function needs to be passed to kernel/sched/core.c
 * partition_sched_domains() routine, which will rebuild the scheduler's
 * load balancing domains (sched domains) as specified by that partial
 * partition.
 *
 * Must be called with cpuset_mutex held.
 *
 * The two key local variables below are:
 *    csa  - (for CpuSet Array) Array of pointers to all the cpusets
 *	     that need to be load balanced, for convenient iterative
 *	     access by the subsequent code that finds the best partition,
 *	     i.e. the set of domains (subsets) of CPUs such that the
 *	     cpus_allowed of every cpuset marked is_sched_load_balance
 *	     is a subset of one of these domains, while there are as
 *	     many such domains as possible, each as small as possible.
 *    doms - Conversion of 'csa' to an array of cpumasks, for passing to
 *	     the kernel/sched/core.c routine partition_sched_domains()
 *	     in a convenient format, that can be easily compared to the
 *	     prior value to determine what partition elements (sched
 *	     domains) were changed (added or removed).
 *
 * The triple nested loops below over i, j, k scan over the load
 * balanced cpusets (using the array of cpuset pointers in csa[])
 * looking for pairs of cpusets that have overlapping effective CPUs
 * and merging them into the same partition, until no such pairs
 * remain.  The union of the effective_cpus of all cpusets sharing a
 * partition number becomes one sched domain, whose attributes are the
 * most relaxed relax_domain_level among its member cpusets.
 */
620static int generate_sched_domains(cpumask_var_t **domains,
621 struct sched_domain_attr **attributes)
622{
623 struct cpuset *cp;
624 struct cpuset **csa;
625 int csn;
626 int i, j, k;
627 cpumask_var_t *doms;
628 struct sched_domain_attr *dattr;
629 int ndoms = 0;
630 int nslot;
631 struct cgroup_subsys_state *pos_css;
632
633 doms = NULL;
634 dattr = NULL;
635 csa = NULL;
636
637
638 if (is_sched_load_balance(&top_cpuset)) {
639 ndoms = 1;
640 doms = alloc_sched_domains(ndoms);
641 if (!doms)
642 goto done;
643
644 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
645 if (dattr) {
646 *dattr = SD_ATTR_INIT;
647 update_domain_attr_tree(dattr, &top_cpuset);
648 }
649 cpumask_copy(doms[0], top_cpuset.effective_cpus);
650
651 goto done;
652 }
653
654 csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL);
655 if (!csa)
656 goto done;
657 csn = 0;
658
659 rcu_read_lock();
660 cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
661 if (cp == &top_cpuset)
662 continue;
663
664
665
666
667
668
669
670
671 if (!cpumask_empty(cp->cpus_allowed) &&
672 !is_sched_load_balance(cp))
673 continue;
674
675 if (is_sched_load_balance(cp))
676 csa[csn++] = cp;
677
678
679 pos_css = css_rightmost_descendant(pos_css);
680 }
681 rcu_read_unlock();
682
683 for (i = 0; i < csn; i++)
684 csa[i]->pn = i;
685 ndoms = csn;
686
687restart:
688
689 for (i = 0; i < csn; i++) {
690 struct cpuset *a = csa[i];
691 int apn = a->pn;
692
693 for (j = 0; j < csn; j++) {
694 struct cpuset *b = csa[j];
695 int bpn = b->pn;
696
697 if (apn != bpn && cpusets_overlap(a, b)) {
698 for (k = 0; k < csn; k++) {
699 struct cpuset *c = csa[k];
700
701 if (c->pn == bpn)
702 c->pn = apn;
703 }
704 ndoms--;
705 goto restart;
706 }
707 }
708 }
709
710
711
712
713
714 doms = alloc_sched_domains(ndoms);
715 if (!doms)
716 goto done;
717
718
719
720
721
722 dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
723
724 for (nslot = 0, i = 0; i < csn; i++) {
725 struct cpuset *a = csa[i];
726 struct cpumask *dp;
727 int apn = a->pn;
728
729 if (apn < 0) {
730
731 continue;
732 }
733
734 dp = doms[nslot];
735
736 if (nslot == ndoms) {
737 static int warnings = 10;
738 if (warnings) {
739 pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
740 nslot, ndoms, csn, i, apn);
741 warnings--;
742 }
743 continue;
744 }
745
746 cpumask_clear(dp);
747 if (dattr)
748 *(dattr + nslot) = SD_ATTR_INIT;
749 for (j = i; j < csn; j++) {
750 struct cpuset *b = csa[j];
751
752 if (apn == b->pn) {
753 cpumask_or(dp, dp, b->effective_cpus);
754 if (dattr)
755 update_domain_attr_tree(dattr + nslot, b);
756
757
758 b->pn = -1;
759 }
760 }
761 nslot++;
762 }
763 BUG_ON(nslot != ndoms);
764
765done:
766 kfree(csa);
767
768
769
770
771
772 if (doms == NULL)
773 ndoms = 1;
774
775 *domains = doms;
776 *attributes = dattr;
777 return ndoms;
778}
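/*
 * For example (illustration only): with sched_load_balance cleared on the
 * root cpuset and set on two sibling cpusets whose effective_cpus are 0-3
 * and 4-7, the loops above leave the two cpusets in different partitions
 * and the function returns ndoms == 2 with doms[0] = 0-3 and
 * doms[1] = 4-7.  Had their effective_cpus overlapped, the two would be
 * merged into a single sched domain instead.
 */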
779
/*
 * Rebuild scheduler domains.
 *
 * If the flag 'sched_load_balance' of any cpuset with non-empty
 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
 * with non-empty 'cpus', then we need to rebuild the sched domains
 * partitioning.
 *
 * Call with cpuset_mutex held.  Takes get_online_cpus().
 */
791static void rebuild_sched_domains_locked(void)
792{
793 struct sched_domain_attr *attr;
794 cpumask_var_t *doms;
795 int ndoms;
796
797 lockdep_assert_held(&cpuset_mutex);
798 get_online_cpus();
799
800
801
802
803
804
805 if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
806 goto out;
807
808
809 ndoms = generate_sched_domains(&doms, &attr);
810
811
812 partition_sched_domains(ndoms, doms, attr);
813out:
814 put_online_cpus();
815}
816#else
817static void rebuild_sched_domains_locked(void)
818{
819}
820#endif
821
822void rebuild_sched_domains(void)
823{
824 mutex_lock(&cpuset_mutex);
825 rebuild_sched_domains_locked();
826 mutex_unlock(&cpuset_mutex);
827}
828
/**
 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
 *
 * Iterate through each task of @cs updating its cpus_allowed to the
 * effective cpuset's.  As this function is called with cpuset_mutex held,
 * cpuset membership stays stable.
 */
837static void update_tasks_cpumask(struct cpuset *cs)
838{
839 struct css_task_iter it;
840 struct task_struct *task;
841
842 css_task_iter_start(&cs->css, &it);
843 while ((task = css_task_iter_next(&it)))
844 set_cpus_allowed_ptr(task, cs->effective_cpus);
845 css_task_iter_end(&it);
846}
847
/*
 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
 * @cs: the cpuset to consider
 * @new_cpus: temp variable for calculating new effective_cpus
 *
 * When the configured cpumask is changed, the effective cpumasks of this
 * cpuset and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
 *
 * Called with cpuset_mutex held.
 */
860static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
861{
862 struct cpuset *cp;
863 struct cgroup_subsys_state *pos_css;
864 bool need_rebuild_sched_domains = false;
865
866 rcu_read_lock();
867 cpuset_for_each_descendant_pre(cp, pos_css, cs) {
868 struct cpuset *parent = parent_cs(cp);
869
870 cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
871
872
873
874
875
876 if (cpumask_empty(new_cpus))
877 cpumask_copy(new_cpus, parent->effective_cpus);
878
879
880 if (cpumask_equal(new_cpus, cp->effective_cpus)) {
881 pos_css = css_rightmost_descendant(pos_css);
882 continue;
883 }
884
885 if (!css_tryget_online(&cp->css))
886 continue;
887 rcu_read_unlock();
888
889 spin_lock_irq(&callback_lock);
890 cpumask_copy(cp->effective_cpus, new_cpus);
891 spin_unlock_irq(&callback_lock);
892
893 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
894 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
895
896 update_tasks_cpumask(cp);
897
898
899
900
901
902 if (!cpumask_empty(cp->cpus_allowed) &&
903 is_sched_load_balance(cp))
904 need_rebuild_sched_domains = true;
905
906 rcu_read_lock();
907 css_put(&cp->css);
908 }
909 rcu_read_unlock();
910
911 if (need_rebuild_sched_domains)
912 rebuild_sched_domains_locked();
913}
914
/**
 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
 * @cs: the cpuset to consider
 * @trialcs: trial cpuset
 * @buf: buffer of cpu numbers written to this cpuset
 */
921static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
922 const char *buf)
923{
924 int retval;
925
926
927 if (cs == &top_cpuset)
928 return -EACCES;
929
930
931
932
933
934
935
936 if (!*buf) {
937 cpumask_clear(trialcs->cpus_allowed);
938 } else {
939 retval = cpulist_parse(buf, trialcs->cpus_allowed);
940 if (retval < 0)
941 return retval;
942
943 if (!cpumask_subset(trialcs->cpus_allowed,
944 top_cpuset.cpus_allowed))
945 return -EINVAL;
946 }
947
948
949 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
950 return 0;
951
952 retval = validate_change(cs, trialcs);
953 if (retval < 0)
954 return retval;
955
956 spin_lock_irq(&callback_lock);
957 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
958 spin_unlock_irq(&callback_lock);
959
960
961 update_cpumasks_hier(cs, trialcs->cpus_allowed);
962 return 0;
963}
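/*
 * Example: writing "0-3,6" to cpuset.cpus is parsed by cpulist_parse()
 * into CPUs {0,1,2,3,6}; it must be a subset of the root cpuset's CPUs
 * or the write fails with -EINVAL.  Writing an empty string clears the
 * mask, which validate_change() only allows when the cpuset has no tasks.
 */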
964
/*
 * cpuset_migrate_mm
 *
 *    Migrate memory region from one set of nodes to another.
 *
 *    Temporarily set tasks mems_allowed to target nodes of migration,
 *    so that the migration code can allocate pages on these nodes.
 *
 *    While the mm_struct we are migrating is typically from some
 *    other task, the task_struct mems_allowed that we are hacking
 *    is for our current task, which must allocate new pages for that
 *    migrating memory region.
 */
979static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
980 const nodemask_t *to)
981{
982 struct task_struct *tsk = current;
983
984 tsk->mems_allowed = *to;
985
986 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
987
988 rcu_read_lock();
989 guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
990 rcu_read_unlock();
991}
992
/*
 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
 * @tsk: the task to change
 * @newmems: new nodes that the task will be set
 *
 * In order to avoid seeing no nodes if the old and new nodes are disjoint,
 * we structure updates as setting all new allowed nodes, then clearing
 * newly disallowed ones.
 */
1002static void cpuset_change_task_nodemask(struct task_struct *tsk,
1003 nodemask_t *newmems)
1004{
1005 bool need_loop;
1006
1007
1008
1009
1010
1011 if (unlikely(test_thread_flag(TIF_MEMDIE)))
1012 return;
1013 if (current->flags & PF_EXITING)
1014 return;
1015
1016 task_lock(tsk);
1017
1018
1019
1020
1021
1022
1023 need_loop = task_has_mempolicy(tsk) ||
1024 !nodes_intersects(*newmems, tsk->mems_allowed);
1025
1026 if (need_loop) {
1027 local_irq_disable();
1028 write_seqcount_begin(&tsk->mems_allowed_seq);
1029 }
1030
1031 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
1032 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
1033
1034 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
1035 tsk->mems_allowed = *newmems;
1036
1037 if (need_loop) {
1038 write_seqcount_end(&tsk->mems_allowed_seq);
1039 local_irq_enable();
1040 }
1041
1042 task_unlock(tsk);
1043}
1044
1045static void *cpuset_being_rebound;
1046
/**
 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
 *
 * Iterate through each task of @cs updating its mems_allowed to the
 * effective cpuset's.  As this function is called with cpuset_mutex held,
 * cpuset membership stays stable.
 */
1055static void update_tasks_nodemask(struct cpuset *cs)
1056{
1057 static nodemask_t newmems;
1058 struct css_task_iter it;
1059 struct task_struct *task;
1060
1061 cpuset_being_rebound = cs;
1062
1063 guarantee_online_mems(cs, &newmems);
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075 css_task_iter_start(&cs->css, &it);
1076 while ((task = css_task_iter_next(&it))) {
1077 struct mm_struct *mm;
1078 bool migrate;
1079
1080 cpuset_change_task_nodemask(task, &newmems);
1081
1082 mm = get_task_mm(task);
1083 if (!mm)
1084 continue;
1085
1086 migrate = is_memory_migrate(cs);
1087
1088 mpol_rebind_mm(mm, &cs->mems_allowed);
1089 if (migrate)
1090 cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
1091 mmput(mm);
1092 }
1093 css_task_iter_end(&it);
1094
1095
1096
1097
1098
1099 cs->old_mems_allowed = newmems;
1100
1101
1102 cpuset_being_rebound = NULL;
1103}
1104
/*
 * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
 * @cs: the cpuset to consider
 * @new_mems: a temp variable for calculating new effective_mems
 *
 * When the configured nodemask is changed, the effective nodemasks of this
 * cpuset and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_mems will be the same as mems_allowed.
 *
 * Called with cpuset_mutex held.
 */
1117static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
1118{
1119 struct cpuset *cp;
1120 struct cgroup_subsys_state *pos_css;
1121
1122 rcu_read_lock();
1123 cpuset_for_each_descendant_pre(cp, pos_css, cs) {
1124 struct cpuset *parent = parent_cs(cp);
1125
1126 nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
1127
1128
1129
1130
1131
1132 if (nodes_empty(*new_mems))
1133 *new_mems = parent->effective_mems;
1134
1135
1136 if (nodes_equal(*new_mems, cp->effective_mems)) {
1137 pos_css = css_rightmost_descendant(pos_css);
1138 continue;
1139 }
1140
1141 if (!css_tryget_online(&cp->css))
1142 continue;
1143 rcu_read_unlock();
1144
1145 spin_lock_irq(&callback_lock);
1146 cp->effective_mems = *new_mems;
1147 spin_unlock_irq(&callback_lock);
1148
1149 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
1150 !nodes_equal(cp->mems_allowed, cp->effective_mems));
1151
1152 update_tasks_nodemask(cp);
1153
1154 rcu_read_lock();
1155 css_put(&cp->css);
1156 }
1157 rcu_read_unlock();
1158}
1159
/*
 * Handle user request to change the 'mems' memory placement
 * of a cpuset.  Needs to validate the request, update the
 * cpuset's mems_allowed, and for each task in the cpuset,
 * update mems_allowed and rebind the task's mempolicy and any vma
 * mempolicies and, if the cpuset is marked 'memory_migrate',
 * migrate the tasks' pages to the new memory.
 *
 * Call with cpuset_mutex held.  May take callback_lock during call.
 * Takes task_lock() of each task while rebinding its nodemask, and
 * mmap_sem of each mm via mpol_rebind_mm().
 */
1173static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1174 const char *buf)
1175{
1176 int retval;
1177
1178
1179
1180
1181
1182 if (cs == &top_cpuset) {
1183 retval = -EACCES;
1184 goto done;
1185 }
1186
1187
1188
1189
1190
1191
1192
1193 if (!*buf) {
1194 nodes_clear(trialcs->mems_allowed);
1195 } else {
1196 retval = nodelist_parse(buf, trialcs->mems_allowed);
1197 if (retval < 0)
1198 goto done;
1199
1200 if (!nodes_subset(trialcs->mems_allowed,
1201 top_cpuset.mems_allowed)) {
1202 retval = -EINVAL;
1203 goto done;
1204 }
1205 }
1206
1207 if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
1208 retval = 0;
1209 goto done;
1210 }
1211 retval = validate_change(cs, trialcs);
1212 if (retval < 0)
1213 goto done;
1214
1215 spin_lock_irq(&callback_lock);
1216 cs->mems_allowed = trialcs->mems_allowed;
1217 spin_unlock_irq(&callback_lock);
1218
1219
1220 update_nodemasks_hier(cs, &cs->mems_allowed);
1221done:
1222 return retval;
1223}
1224
1225int current_cpuset_is_being_rebound(void)
1226{
1227 int ret;
1228
1229 rcu_read_lock();
1230 ret = task_cs(current) == cpuset_being_rebound;
1231 rcu_read_unlock();
1232
1233 return ret;
1234}
1235
1236static int update_relax_domain_level(struct cpuset *cs, s64 val)
1237{
1238#ifdef CONFIG_SMP
1239 if (val < -1 || val >= sched_domain_level_max)
1240 return -EINVAL;
1241#endif
1242
1243 if (val != cs->relax_domain_level) {
1244 cs->relax_domain_level = val;
1245 if (!cpumask_empty(cs->cpus_allowed) &&
1246 is_sched_load_balance(cs))
1247 rebuild_sched_domains_locked();
1248 }
1249
1250 return 0;
1251}
1252
/**
 * update_tasks_flags - update the spread flags of tasks in the cpuset.
 * @cs: the cpuset in which each task's spread flags needs to be changed
 *
 * Iterate through each task of @cs updating its spread flags.  As this
 * function is called with cpuset_mutex held, cpuset membership stays
 * stable.
 */
1261static void update_tasks_flags(struct cpuset *cs)
1262{
1263 struct css_task_iter it;
1264 struct task_struct *task;
1265
1266 css_task_iter_start(&cs->css, &it);
1267 while ((task = css_task_iter_next(&it)))
1268 cpuset_update_task_spread_flag(cs, task);
1269 css_task_iter_end(&it);
1270}
1271
/*
 * update_flag - read a 0 or a 1 in a file and update associated flag
 * bit:		the bit to update (see cpuset_flagbits_t)
 * cs:		the cpuset to update
 * turning_on:	whether the flag is being set or cleared
 *
 * Call with cpuset_mutex held.
 */
1281static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1282 int turning_on)
1283{
1284 struct cpuset *trialcs;
1285 int balance_flag_changed;
1286 int spread_flag_changed;
1287 int err;
1288
1289 trialcs = alloc_trial_cpuset(cs);
1290 if (!trialcs)
1291 return -ENOMEM;
1292
1293 if (turning_on)
1294 set_bit(bit, &trialcs->flags);
1295 else
1296 clear_bit(bit, &trialcs->flags);
1297
1298 err = validate_change(cs, trialcs);
1299 if (err < 0)
1300 goto out;
1301
1302 balance_flag_changed = (is_sched_load_balance(cs) !=
1303 is_sched_load_balance(trialcs));
1304
1305 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
1306 || (is_spread_page(cs) != is_spread_page(trialcs)));
1307
1308 spin_lock_irq(&callback_lock);
1309 cs->flags = trialcs->flags;
1310 spin_unlock_irq(&callback_lock);
1311
1312 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1313 rebuild_sched_domains_locked();
1314
1315 if (spread_flag_changed)
1316 update_tasks_flags(cs);
1317out:
1318 free_trial_cpuset(trialcs);
1319 return err;
1320}
1321
/*
 * Frequency meter - How fast is some event occurring?
 *
 * These routines manage a digitally filtered, constant time based,
 * event frequency meter.  There are four routines:
 *   fmeter_init() - initialize a frequency meter.
 *   fmeter_markevent() - called each time the event happens.
 *   fmeter_getrate() - returns the recent rate of such events.
 *   fmeter_update() - internal routine used to update fmeter.
 *
 * A common data structure is passed to each of these routines,
 * which is used to keep track of the state required to manage the
 * frequency meter and its digital filter.
 *
 * The filter works on the number of events marked per unit time.
 * It is a single-pole low-pass recursive (IIR) filter with a one
 * second time unit.  Arithmetic is done using 32-bit integers scaled
 * by FM_SCALE (1000).
 *
 * With an FM_COEF of 933, the filter has a half-life of roughly ten
 * seconds: if the events stop, the value returned by fmeter_getrate()
 * is cut in half about every 10 seconds until it decays to zero.
 * At a constant event rate the reported value settles near 1000 times
 * the per-second rate, capped just under FM_MAXCNT (1,000,000), i.e.
 * about one event per millisecond.
 *
 * If more than FM_MAXTICKS ticks have elapsed since the last update,
 * fmeter_update() only computes FM_MAXTICKS ticks worth of decay, by
 * which point the value is effectively zero anyway.
 */
1367#define FM_COEF 933
1368#define FM_MAXTICKS ((time_t)99)
1369#define FM_MAXCNT 1000000
1370#define FM_SCALE 1000
1371
1372
1373static void fmeter_init(struct fmeter *fmp)
1374{
1375 fmp->cnt = 0;
1376 fmp->val = 0;
1377 fmp->time = 0;
1378 spin_lock_init(&fmp->lock);
1379}
1380
1381
1382static void fmeter_update(struct fmeter *fmp)
1383{
1384 time_t now = get_seconds();
1385 time_t ticks = now - fmp->time;
1386
1387 if (ticks == 0)
1388 return;
1389
1390 ticks = min(FM_MAXTICKS, ticks);
1391 while (ticks-- > 0)
1392 fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
1393 fmp->time = now;
1394
1395 fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
1396 fmp->cnt = 0;
1397}
1398
1399
1400static void fmeter_markevent(struct fmeter *fmp)
1401{
1402 spin_lock(&fmp->lock);
1403 fmeter_update(fmp);
1404 fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
1405 spin_unlock(&fmp->lock);
1406}
1407
1408
1409static int fmeter_getrate(struct fmeter *fmp)
1410{
1411 int val;
1412
1413 spin_lock(&fmp->lock);
1414 fmeter_update(fmp);
1415 val = fmp->val;
1416 spin_unlock(&fmp->lock);
1417 return val;
1418}
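/*
 * Worked example of the filter above: one fmeter_markevent() call adds
 * FM_SCALE (1000) to ->cnt.  The next fmeter_update() mixes
 * (FM_SCALE - FM_COEF) / FM_SCALE = 6.7% of that into ->val, and every
 * further idle second multiplies ->val by FM_COEF/FM_SCALE = 0.933, so
 * the reported rate halves roughly every 10 seconds (0.933^10 ~= 0.5).
 */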
1419
1420static struct cpuset *cpuset_attach_old_cs;
1421
1422
1423static int cpuset_can_attach(struct cgroup_subsys_state *css,
1424 struct cgroup_taskset *tset)
1425{
1426 struct cpuset *cs = css_cs(css);
1427 struct task_struct *task;
1428 int ret;
1429
1430
1431 cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset));
1432
1433 mutex_lock(&cpuset_mutex);
1434
1435
1436 ret = -ENOSPC;
1437 if (!cgroup_on_dfl(css->cgroup) &&
1438 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
1439 goto out_unlock;
1440
1441 cgroup_taskset_for_each(task, tset) {
1442 ret = task_can_attach(task, cs->cpus_allowed);
1443 if (ret)
1444 goto out_unlock;
1445 ret = security_task_setscheduler(task);
1446 if (ret)
1447 goto out_unlock;
1448 }
1449
1450
1451
1452
1453
1454 cs->attach_in_progress++;
1455 ret = 0;
1456out_unlock:
1457 mutex_unlock(&cpuset_mutex);
1458 return ret;
1459}
1460
1461static void cpuset_cancel_attach(struct cgroup_subsys_state *css,
1462 struct cgroup_taskset *tset)
1463{
1464 mutex_lock(&cpuset_mutex);
1465 css_cs(css)->attach_in_progress--;
1466 mutex_unlock(&cpuset_mutex);
1467}
1468
1469
1470
1471
1472
1473
1474static cpumask_var_t cpus_attach;
1475
1476static void cpuset_attach(struct cgroup_subsys_state *css,
1477 struct cgroup_taskset *tset)
1478{
1479
1480 static nodemask_t cpuset_attach_nodemask_to;
1481 struct mm_struct *mm;
1482 struct task_struct *task;
1483 struct task_struct *leader = cgroup_taskset_first(tset);
1484 struct cpuset *cs = css_cs(css);
1485 struct cpuset *oldcs = cpuset_attach_old_cs;
1486
1487 mutex_lock(&cpuset_mutex);
1488
1489
1490 if (cs == &top_cpuset)
1491 cpumask_copy(cpus_attach, cpu_possible_mask);
1492 else
1493 guarantee_online_cpus(cs, cpus_attach);
1494
1495 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
1496
1497 cgroup_taskset_for_each(task, tset) {
1498
1499
1500
1501
1502 WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
1503
1504 cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
1505 cpuset_update_task_spread_flag(cs, task);
1506 }
1507
1508
1509
1510
1511
1512 cpuset_attach_nodemask_to = cs->effective_mems;
1513 mm = get_task_mm(leader);
1514 if (mm) {
1515 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
1516
1517
1518
1519
1520
1521
1522
1523
1524 if (is_memory_migrate(cs)) {
1525 cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
1526 &cpuset_attach_nodemask_to);
1527 }
1528 mmput(mm);
1529 }
1530
1531 cs->old_mems_allowed = cpuset_attach_nodemask_to;
1532
1533 cs->attach_in_progress--;
1534 if (!cs->attach_in_progress)
1535 wake_up(&cpuset_attach_wq);
1536
1537 mutex_unlock(&cpuset_mutex);
1538}
1539
/* The various types of files and directories in a cpuset file system */
1542typedef enum {
1543 FILE_MEMORY_MIGRATE,
1544 FILE_CPULIST,
1545 FILE_MEMLIST,
1546 FILE_EFFECTIVE_CPULIST,
1547 FILE_EFFECTIVE_MEMLIST,
1548 FILE_CPU_EXCLUSIVE,
1549 FILE_MEM_EXCLUSIVE,
1550 FILE_MEM_HARDWALL,
1551 FILE_SCHED_LOAD_BALANCE,
1552 FILE_SCHED_RELAX_DOMAIN_LEVEL,
1553 FILE_MEMORY_PRESSURE_ENABLED,
1554 FILE_MEMORY_PRESSURE,
1555 FILE_SPREAD_PAGE,
1556 FILE_SPREAD_SLAB,
1557} cpuset_filetype_t;
1558
1559static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
1560 u64 val)
1561{
1562 struct cpuset *cs = css_cs(css);
1563 cpuset_filetype_t type = cft->private;
1564 int retval = 0;
1565
1566 mutex_lock(&cpuset_mutex);
1567 if (!is_cpuset_online(cs)) {
1568 retval = -ENODEV;
1569 goto out_unlock;
1570 }
1571
1572 switch (type) {
1573 case FILE_CPU_EXCLUSIVE:
1574 retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
1575 break;
1576 case FILE_MEM_EXCLUSIVE:
1577 retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
1578 break;
1579 case FILE_MEM_HARDWALL:
1580 retval = update_flag(CS_MEM_HARDWALL, cs, val);
1581 break;
1582 case FILE_SCHED_LOAD_BALANCE:
1583 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
1584 break;
1585 case FILE_MEMORY_MIGRATE:
1586 retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
1587 break;
1588 case FILE_MEMORY_PRESSURE_ENABLED:
1589 cpuset_memory_pressure_enabled = !!val;
1590 break;
1591 case FILE_MEMORY_PRESSURE:
1592 retval = -EACCES;
1593 break;
1594 case FILE_SPREAD_PAGE:
1595 retval = update_flag(CS_SPREAD_PAGE, cs, val);
1596 break;
1597 case FILE_SPREAD_SLAB:
1598 retval = update_flag(CS_SPREAD_SLAB, cs, val);
1599 break;
1600 default:
1601 retval = -EINVAL;
1602 break;
1603 }
1604out_unlock:
1605 mutex_unlock(&cpuset_mutex);
1606 return retval;
1607}
1608
1609static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
1610 s64 val)
1611{
1612 struct cpuset *cs = css_cs(css);
1613 cpuset_filetype_t type = cft->private;
1614 int retval = -ENODEV;
1615
1616 mutex_lock(&cpuset_mutex);
1617 if (!is_cpuset_online(cs))
1618 goto out_unlock;
1619
1620 switch (type) {
1621 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1622 retval = update_relax_domain_level(cs, val);
1623 break;
1624 default:
1625 retval = -EINVAL;
1626 break;
1627 }
1628out_unlock:
1629 mutex_unlock(&cpuset_mutex);
1630 return retval;
1631}
1632
/*
 * Common handling for a write to a "cpus" or "mems" file.
 */
1636static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
1637 char *buf, size_t nbytes, loff_t off)
1638{
1639 struct cpuset *cs = css_cs(of_css(of));
1640 struct cpuset *trialcs;
1641 int retval = -ENODEV;
1642
1643 buf = strstrip(buf);
	/*
	 * CPU or memory hotunplug may leave @cs w/o any execution
	 * resources, in which case the hotplug code asynchronously updates
	 * configuration and transfers all tasks to the nearest ancestor
	 * which can execute.
	 *
	 * As writes to "cpus" or "mems" may restore @cs's execution
	 * resources, wait for the previously scheduled operations before
	 * proceeding, so that we don't end up keep removing tasks added
	 * after execution capability is restored.
	 *
	 * cpuset_hotplug_work calls back into cgroup core asynchronously,
	 * and waiting for it from a kernfs operation may deadlock through
	 * the kernfs active protection, so that protection is dropped
	 * around the flush_work() and the cpuset's online state is
	 * re-checked under cpuset_mutex below.
	 */
1664 css_get(&cs->css);
1665 kernfs_break_active_protection(of->kn);
1666 flush_work(&cpuset_hotplug_work);
1667
1668 mutex_lock(&cpuset_mutex);
1669 if (!is_cpuset_online(cs))
1670 goto out_unlock;
1671
1672 trialcs = alloc_trial_cpuset(cs);
1673 if (!trialcs) {
1674 retval = -ENOMEM;
1675 goto out_unlock;
1676 }
1677
1678 switch (of_cft(of)->private) {
1679 case FILE_CPULIST:
1680 retval = update_cpumask(cs, trialcs, buf);
1681 break;
1682 case FILE_MEMLIST:
1683 retval = update_nodemask(cs, trialcs, buf);
1684 break;
1685 default:
1686 retval = -EINVAL;
1687 break;
1688 }
1689
1690 free_trial_cpuset(trialcs);
1691out_unlock:
1692 mutex_unlock(&cpuset_mutex);
1693 kernfs_unbreak_active_protection(of->kn);
1694 css_put(&cs->css);
1695 return retval ?: nbytes;
1696}
1697
/*
 * These ascii lists should be read in a single call, by using a user
 * buffer large enough to hold the entire map.  If read in smaller
 * chunks, there is no guarantee of atomicity.  Since the display format
 * used, list of ranges of sequential numbers, is variable length,
 * and since these maps can change value dynamically, one could read
 * gibberish by doing partial reads while a list was changing.
 */
1706static int cpuset_common_seq_show(struct seq_file *sf, void *v)
1707{
1708 struct cpuset *cs = css_cs(seq_css(sf));
1709 cpuset_filetype_t type = seq_cft(sf)->private;
1710 ssize_t count;
1711 char *buf, *s;
1712 int ret = 0;
1713
1714 count = seq_get_buf(sf, &buf);
1715 s = buf;
1716
1717 spin_lock_irq(&callback_lock);
1718
1719 switch (type) {
1720 case FILE_CPULIST:
1721 s += cpulist_scnprintf(s, count, cs->cpus_allowed);
1722 break;
1723 case FILE_MEMLIST:
1724 s += nodelist_scnprintf(s, count, cs->mems_allowed);
1725 break;
1726 case FILE_EFFECTIVE_CPULIST:
1727 s += cpulist_scnprintf(s, count, cs->effective_cpus);
1728 break;
1729 case FILE_EFFECTIVE_MEMLIST:
1730 s += nodelist_scnprintf(s, count, cs->effective_mems);
1731 break;
1732 default:
1733 ret = -EINVAL;
1734 goto out_unlock;
1735 }
1736
1737 if (s < buf + count - 1) {
1738 *s++ = '\n';
1739 seq_commit(sf, s - buf);
1740 } else {
1741 seq_commit(sf, -1);
1742 }
1743out_unlock:
1744 spin_unlock_irq(&callback_lock);
1745 return ret;
1746}
1747
1748static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
1749{
1750 struct cpuset *cs = css_cs(css);
1751 cpuset_filetype_t type = cft->private;
1752 switch (type) {
1753 case FILE_CPU_EXCLUSIVE:
1754 return is_cpu_exclusive(cs);
1755 case FILE_MEM_EXCLUSIVE:
1756 return is_mem_exclusive(cs);
1757 case FILE_MEM_HARDWALL:
1758 return is_mem_hardwall(cs);
1759 case FILE_SCHED_LOAD_BALANCE:
1760 return is_sched_load_balance(cs);
1761 case FILE_MEMORY_MIGRATE:
1762 return is_memory_migrate(cs);
1763 case FILE_MEMORY_PRESSURE_ENABLED:
1764 return cpuset_memory_pressure_enabled;
1765 case FILE_MEMORY_PRESSURE:
1766 return fmeter_getrate(&cs->fmeter);
1767 case FILE_SPREAD_PAGE:
1768 return is_spread_page(cs);
1769 case FILE_SPREAD_SLAB:
1770 return is_spread_slab(cs);
1771 default:
1772 BUG();
1773 }
1774
1775
1776 return 0;
1777}
1778
1779static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
1780{
1781 struct cpuset *cs = css_cs(css);
1782 cpuset_filetype_t type = cft->private;
1783 switch (type) {
1784 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1785 return cs->relax_domain_level;
1786 default:
1787 BUG();
1788 }
1789
1790
1791 return 0;
1792}
1793
/*
 * for the common functions, 'private' gives the type of file
 */
1799static struct cftype files[] = {
1800 {
1801 .name = "cpus",
1802 .seq_show = cpuset_common_seq_show,
1803 .write = cpuset_write_resmask,
1804 .max_write_len = (100U + 6 * NR_CPUS),
1805 .private = FILE_CPULIST,
1806 },
1807
1808 {
1809 .name = "mems",
1810 .seq_show = cpuset_common_seq_show,
1811 .write = cpuset_write_resmask,
1812 .max_write_len = (100U + 6 * MAX_NUMNODES),
1813 .private = FILE_MEMLIST,
1814 },
1815
1816 {
1817 .name = "effective_cpus",
1818 .seq_show = cpuset_common_seq_show,
1819 .private = FILE_EFFECTIVE_CPULIST,
1820 },
1821
1822 {
1823 .name = "effective_mems",
1824 .seq_show = cpuset_common_seq_show,
1825 .private = FILE_EFFECTIVE_MEMLIST,
1826 },
1827
1828 {
1829 .name = "cpu_exclusive",
1830 .read_u64 = cpuset_read_u64,
1831 .write_u64 = cpuset_write_u64,
1832 .private = FILE_CPU_EXCLUSIVE,
1833 },
1834
1835 {
1836 .name = "mem_exclusive",
1837 .read_u64 = cpuset_read_u64,
1838 .write_u64 = cpuset_write_u64,
1839 .private = FILE_MEM_EXCLUSIVE,
1840 },
1841
1842 {
1843 .name = "mem_hardwall",
1844 .read_u64 = cpuset_read_u64,
1845 .write_u64 = cpuset_write_u64,
1846 .private = FILE_MEM_HARDWALL,
1847 },
1848
1849 {
1850 .name = "sched_load_balance",
1851 .read_u64 = cpuset_read_u64,
1852 .write_u64 = cpuset_write_u64,
1853 .private = FILE_SCHED_LOAD_BALANCE,
1854 },
1855
1856 {
1857 .name = "sched_relax_domain_level",
1858 .read_s64 = cpuset_read_s64,
1859 .write_s64 = cpuset_write_s64,
1860 .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
1861 },
1862
1863 {
1864 .name = "memory_migrate",
1865 .read_u64 = cpuset_read_u64,
1866 .write_u64 = cpuset_write_u64,
1867 .private = FILE_MEMORY_MIGRATE,
1868 },
1869
1870 {
1871 .name = "memory_pressure",
1872 .read_u64 = cpuset_read_u64,
1873 .write_u64 = cpuset_write_u64,
1874 .private = FILE_MEMORY_PRESSURE,
1875 .mode = S_IRUGO,
1876 },
1877
1878 {
1879 .name = "memory_spread_page",
1880 .read_u64 = cpuset_read_u64,
1881 .write_u64 = cpuset_write_u64,
1882 .private = FILE_SPREAD_PAGE,
1883 },
1884
1885 {
1886 .name = "memory_spread_slab",
1887 .read_u64 = cpuset_read_u64,
1888 .write_u64 = cpuset_write_u64,
1889 .private = FILE_SPREAD_SLAB,
1890 },
1891
1892 {
1893 .name = "memory_pressure_enabled",
1894 .flags = CFTYPE_ONLY_ON_ROOT,
1895 .read_u64 = cpuset_read_u64,
1896 .write_u64 = cpuset_write_u64,
1897 .private = FILE_MEMORY_PRESSURE_ENABLED,
1898 },
1899
1900 { }
1901};
1902
/*
 * cpuset_css_alloc - allocate and initialize a cpuset css
 * @parent_css: css of the parent cpuset; NULL when setting up the root
 *		(top_cpuset)
 */
1908static struct cgroup_subsys_state *
1909cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
1910{
1911 struct cpuset *cs;
1912
1913 if (!parent_css)
1914 return &top_cpuset.css;
1915
1916 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
1917 if (!cs)
1918 return ERR_PTR(-ENOMEM);
1919 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
1920 goto free_cs;
1921 if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
1922 goto free_cpus;
1923
1924 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1925 cpumask_clear(cs->cpus_allowed);
1926 nodes_clear(cs->mems_allowed);
1927 cpumask_clear(cs->effective_cpus);
1928 nodes_clear(cs->effective_mems);
1929 fmeter_init(&cs->fmeter);
1930 cs->relax_domain_level = -1;
1931
1932 return &cs->css;
1933
1934free_cpus:
1935 free_cpumask_var(cs->cpus_allowed);
1936free_cs:
1937 kfree(cs);
1938 return ERR_PTR(-ENOMEM);
1939}
1940
1941static int cpuset_css_online(struct cgroup_subsys_state *css)
1942{
1943 struct cpuset *cs = css_cs(css);
1944 struct cpuset *parent = parent_cs(cs);
1945 struct cpuset *tmp_cs;
1946 struct cgroup_subsys_state *pos_css;
1947
1948 if (!parent)
1949 return 0;
1950
1951 mutex_lock(&cpuset_mutex);
1952
1953 set_bit(CS_ONLINE, &cs->flags);
1954 if (is_spread_page(parent))
1955 set_bit(CS_SPREAD_PAGE, &cs->flags);
1956 if (is_spread_slab(parent))
1957 set_bit(CS_SPREAD_SLAB, &cs->flags);
1958
1959 cpuset_inc();
1960
1961 spin_lock_irq(&callback_lock);
1962 if (cgroup_on_dfl(cs->css.cgroup)) {
1963 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
1964 cs->effective_mems = parent->effective_mems;
1965 }
1966 spin_unlock_irq(&callback_lock);
1967
1968 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
1969 goto out_unlock;
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984 rcu_read_lock();
1985 cpuset_for_each_child(tmp_cs, pos_css, parent) {
1986 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
1987 rcu_read_unlock();
1988 goto out_unlock;
1989 }
1990 }
1991 rcu_read_unlock();
1992
1993 spin_lock_irq(&callback_lock);
1994 cs->mems_allowed = parent->mems_allowed;
1995 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
1996 spin_unlock_irq(&callback_lock);
1997out_unlock:
1998 mutex_unlock(&cpuset_mutex);
1999 return 0;
2000}
2001
/*
 * If the cpuset being removed has its flag 'sched_load_balance'
 * enabled, then simulate turning sched_load_balance off, which
 * will call rebuild_sched_domains_locked().
 */
2008static void cpuset_css_offline(struct cgroup_subsys_state *css)
2009{
2010 struct cpuset *cs = css_cs(css);
2011
2012 mutex_lock(&cpuset_mutex);
2013
2014 if (is_sched_load_balance(cs))
2015 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
2016
2017 cpuset_dec();
2018 clear_bit(CS_ONLINE, &cs->flags);
2019
2020 mutex_unlock(&cpuset_mutex);
2021}
2022
2023static void cpuset_css_free(struct cgroup_subsys_state *css)
2024{
2025 struct cpuset *cs = css_cs(css);
2026
2027 free_cpumask_var(cs->effective_cpus);
2028 free_cpumask_var(cs->cpus_allowed);
2029 kfree(cs);
2030}
2031
2032static void cpuset_bind(struct cgroup_subsys_state *root_css)
2033{
2034 mutex_lock(&cpuset_mutex);
2035 spin_lock_irq(&callback_lock);
2036
2037 if (cgroup_on_dfl(root_css->cgroup)) {
2038 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
2039 top_cpuset.mems_allowed = node_possible_map;
2040 } else {
2041 cpumask_copy(top_cpuset.cpus_allowed,
2042 top_cpuset.effective_cpus);
2043 top_cpuset.mems_allowed = top_cpuset.effective_mems;
2044 }
2045
2046 spin_unlock_irq(&callback_lock);
2047 mutex_unlock(&cpuset_mutex);
2048}
2049
2050struct cgroup_subsys cpuset_cgrp_subsys = {
2051 .css_alloc = cpuset_css_alloc,
2052 .css_online = cpuset_css_online,
2053 .css_offline = cpuset_css_offline,
2054 .css_free = cpuset_css_free,
2055 .can_attach = cpuset_can_attach,
2056 .cancel_attach = cpuset_cancel_attach,
2057 .attach = cpuset_attach,
2058 .bind = cpuset_bind,
2059 .legacy_cftypes = files,
2060 .early_init = 1,
2061};
2062
/**
 * cpuset_init - initialize cpusets at system boot
 *
 * Description: Initialize top_cpuset and the cpuset internal file system
 **/
2069int __init cpuset_init(void)
2070{
2071 int err = 0;
2072
2073 if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
2074 BUG();
2075 if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
2076 BUG();
2077
2078 cpumask_setall(top_cpuset.cpus_allowed);
2079 nodes_setall(top_cpuset.mems_allowed);
2080 cpumask_setall(top_cpuset.effective_cpus);
2081 nodes_setall(top_cpuset.effective_mems);
2082
2083 fmeter_init(&top_cpuset.fmeter);
2084 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
2085 top_cpuset.relax_domain_level = -1;
2086
2087 err = register_filesystem(&cpuset_fs_type);
2088 if (err < 0)
2089 return err;
2090
2091 if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
2092 BUG();
2093
2094 return 0;
2095}
2096
/*
 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
 * or memory nodes, we need to walk over the cpuset hierarchy,
 * removing that CPU or node from all cpusets.  If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
 * cpuset to its next-highest non-empty parent.
 */
2104static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2105{
2106 struct cpuset *parent;
2107
2108
2109
2110
2111
2112 parent = parent_cs(cs);
2113 while (cpumask_empty(parent->cpus_allowed) ||
2114 nodes_empty(parent->mems_allowed))
2115 parent = parent_cs(parent);
2116
2117 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
2118 pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
2119 pr_cont_cgroup_name(cs->css.cgroup);
2120 pr_cont("\n");
2121 }
2122}
2123
2124static void
2125hotplug_update_tasks_legacy(struct cpuset *cs,
2126 struct cpumask *new_cpus, nodemask_t *new_mems,
2127 bool cpus_updated, bool mems_updated)
2128{
2129 bool is_empty;
2130
2131 spin_lock_irq(&callback_lock);
2132 cpumask_copy(cs->cpus_allowed, new_cpus);
2133 cpumask_copy(cs->effective_cpus, new_cpus);
2134 cs->mems_allowed = *new_mems;
2135 cs->effective_mems = *new_mems;
2136 spin_unlock_irq(&callback_lock);
2137
2138
2139
2140
2141
2142 if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
2143 update_tasks_cpumask(cs);
2144 if (mems_updated && !nodes_empty(cs->mems_allowed))
2145 update_tasks_nodemask(cs);
2146
2147 is_empty = cpumask_empty(cs->cpus_allowed) ||
2148 nodes_empty(cs->mems_allowed);
2149
2150 mutex_unlock(&cpuset_mutex);
2151
2152
2153
2154
2155
2156
2157 if (is_empty)
2158 remove_tasks_in_empty_cpuset(cs);
2159
2160 mutex_lock(&cpuset_mutex);
2161}
2162
2163static void
2164hotplug_update_tasks(struct cpuset *cs,
2165 struct cpumask *new_cpus, nodemask_t *new_mems,
2166 bool cpus_updated, bool mems_updated)
2167{
2168 if (cpumask_empty(new_cpus))
2169 cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
2170 if (nodes_empty(*new_mems))
2171 *new_mems = parent_cs(cs)->effective_mems;
2172
2173 spin_lock_irq(&callback_lock);
2174 cpumask_copy(cs->effective_cpus, new_cpus);
2175 cs->effective_mems = *new_mems;
2176 spin_unlock_irq(&callback_lock);
2177
2178 if (cpus_updated)
2179 update_tasks_cpumask(cs);
2180 if (mems_updated)
2181 update_tasks_nodemask(cs);
2182}
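/*
 * Note the difference from hotplug_update_tasks_legacy() above: on the
 * default hierarchy a cpuset whose CPUs or memory nodes have all gone
 * offline falls back to its parent's effective masks and keeps its tasks,
 * whereas on the legacy hierarchy the emptied cpuset has its tasks moved
 * to an ancestor by remove_tasks_in_empty_cpuset().
 */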
2183
/**
 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
 * @cs: cpuset in interest
 *
 * Compare @cs's cpu and mem masks against its parent's effective masks and
 * if some have gone offline, update @cs accordingly.  If @cs ends up with
 * no CPU or memory, all its tasks are moved to the nearest ancestor with
 * both resources.
 */
2192static void cpuset_hotplug_update_tasks(struct cpuset *cs)
2193{
2194 static cpumask_t new_cpus;
2195 static nodemask_t new_mems;
2196 bool cpus_updated;
2197 bool mems_updated;
2198retry:
2199 wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
2200
2201 mutex_lock(&cpuset_mutex);
2202
2203
2204
2205
2206
2207 if (cs->attach_in_progress) {
2208 mutex_unlock(&cpuset_mutex);
2209 goto retry;
2210 }
2211
2212 cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
2213 nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
2214
2215 cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
2216 mems_updated = !nodes_equal(new_mems, cs->effective_mems);
2217
2218 if (cgroup_on_dfl(cs->css.cgroup))
2219 hotplug_update_tasks(cs, &new_cpus, &new_mems,
2220 cpus_updated, mems_updated);
2221 else
2222 hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
2223 cpus_updated, mems_updated);
2224
2225 mutex_unlock(&cpuset_mutex);
2226}
2227
/**
 * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
 *
 * This function is called after either CPU or memory configuration has
 * changed and updates cpusets accordingly.  The top_cpuset is always
 * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
 * order to make cpusets transparent (of no affect) on systems that are
 * actively using CPU hotplug but making no active use of cpusets.
 *
 * Non-root cpusets are only affected by offlining.  If any CPUs or
 * memory nodes have been taken down, cpuset_hotplug_update_tasks() is
 * invoked on all descendants.
 *
 * Note that CPU offlining during suspend is ignored.  We don't modify
 * cpusets across suspend/resume cycles at all.
 */
2244static void cpuset_hotplug_workfn(struct work_struct *work)
2245{
2246 static cpumask_t new_cpus;
2247 static nodemask_t new_mems;
2248 bool cpus_updated, mems_updated;
2249 bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);
2250
2251 mutex_lock(&cpuset_mutex);
2252
2253
2254 cpumask_copy(&new_cpus, cpu_active_mask);
2255 new_mems = node_states[N_MEMORY];
2256
2257 cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
2258 mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
2259
2260
2261 if (cpus_updated) {
2262 spin_lock_irq(&callback_lock);
2263 if (!on_dfl)
2264 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
2265 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
2266 spin_unlock_irq(&callback_lock);
2267
2268 }
2269
2270
2271 if (mems_updated) {
2272 spin_lock_irq(&callback_lock);
2273 if (!on_dfl)
2274 top_cpuset.mems_allowed = new_mems;
2275 top_cpuset.effective_mems = new_mems;
2276 spin_unlock_irq(&callback_lock);
2277 update_tasks_nodemask(&top_cpuset);
2278 }
2279
2280 mutex_unlock(&cpuset_mutex);
2281
2282
2283 if (cpus_updated || mems_updated) {
2284 struct cpuset *cs;
2285 struct cgroup_subsys_state *pos_css;
2286
2287 rcu_read_lock();
2288 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
2289 if (cs == &top_cpuset || !css_tryget_online(&cs->css))
2290 continue;
2291 rcu_read_unlock();
2292
2293 cpuset_hotplug_update_tasks(cs);
2294
2295 rcu_read_lock();
2296 css_put(&cs->css);
2297 }
2298 rcu_read_unlock();
2299 }
2300
2301
2302 if (cpus_updated)
2303 rebuild_sched_domains();
2304}
2305
2306void cpuset_update_active_cpus(bool cpu_online)
2307{
	/*
	 * We're inside cpu hotplug critical region which usually nests
	 * inside cgroup synchronization.  Bounce actual hotplug processing
	 * to a work item to avoid reverse locking order.
	 *
	 * We still need to do partition_sched_domains() synchronously;
	 * otherwise, the scheduler will get confused and put tasks on the
	 * dead CPU.  Fall back to the default single domain;
	 * cpuset_hotplug_workfn() will rebuild it as necessary.
	 */
2318 partition_sched_domains(1, NULL, NULL);
2319 schedule_work(&cpuset_hotplug_work);
2320}
2321
/*
 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
 * Call this routine anytime after node_states[N_MEMORY] changes.
 * See cpuset_update_active_cpus() for CPU hotplug handling.
 */
2327static int cpuset_track_online_nodes(struct notifier_block *self,
2328 unsigned long action, void *arg)
2329{
2330 schedule_work(&cpuset_hotplug_work);
2331 return NOTIFY_OK;
2332}
2333
2334static struct notifier_block cpuset_track_online_nodes_nb = {
2335 .notifier_call = cpuset_track_online_nodes,
2336 .priority = 10,
2337};
2338
/**
 * cpuset_init_smp - initialize cpus_allowed
 *
 * Description: Finish top cpuset after cpu, node maps are initialized
 */
2344void __init cpuset_init_smp(void)
2345{
2346 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2347 top_cpuset.mems_allowed = node_states[N_MEMORY];
2348 top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
2349
2350 cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
2351 top_cpuset.effective_mems = node_states[N_MEMORY];
2352
2353 register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
2354}
2355
/**
 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
 *
 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of cpu_online_mask, even if this means going outside the
 * tasks cpuset.
 **/
2367void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2368{
2369 unsigned long flags;
2370
2371 spin_lock_irqsave(&callback_lock, flags);
2372 rcu_read_lock();
2373 guarantee_online_cpus(task_cs(tsk), pmask);
2374 rcu_read_unlock();
2375 spin_unlock_irqrestore(&callback_lock, flags);
2376}
2377
2378void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2379{
2380 rcu_read_lock();
2381 do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
2382 rcu_read_unlock();
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401}
2402
2403void cpuset_init_current_mems_allowed(void)
2404{
2405 nodes_setall(current->mems_allowed);
2406}
2407
/**
 * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
 *
 * Description: Returns the nodemask_t mems_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of node_states[N_MEMORY], even if this means going outside the
 * tasks cpuset.
 **/
2418nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2419{
2420 nodemask_t mask;
2421 unsigned long flags;
2422
2423 spin_lock_irqsave(&callback_lock, flags);
2424 rcu_read_lock();
2425 guarantee_online_mems(task_cs(tsk), &mask);
2426 rcu_read_unlock();
2427 spin_unlock_irqrestore(&callback_lock, flags);
2428
2429 return mask;
2430}
2431
/**
 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
 * @nodemask: the nodemask to be checked
 *
 * Are any of the nodes in the nodemask allowed in current->mems_allowed?
 */
2438int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
2439{
2440 return nodes_intersects(*nodemask, current->mems_allowed);
2441}
2442
/*
 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
 * mem_hardwall ancestor to the specified cpuset.  Call holding
 * callback_lock.  If no ancestor is mem_exclusive or mem_hardwall
 * (the top cpuset isn't), return the top cpuset.
 */
2449static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2450{
2451 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
2452 cs = parent_cs(cs);
2453 return cs;
2454}
2455
/**
 * __cpuset_node_allowed - Can we allocate on a memory node?
 * @node: is this an allowed node?
 * @gfp_mask: memory allocation flags
 *
 * If we're in interrupt, yes, we can always allocate.  If @node is set in
 * current's mems_allowed, yes.  If it's not a __GFP_HARDWALL request and
 * this node is set in the nearest hardwalled cpuset ancestor to current's
 * cpuset, yes.  If current has access to memory reserves due to TIF_MEMDIE,
 * yes.  Otherwise, no.
 *
 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
 * and do not allow allocations outside the current tasks cpuset
 * unless the task has been OOM killed and is marked TIF_MEMDIE.
 * GFP_KERNEL allocations are not so marked, so can escape to the
 * nearest enclosing hardwalled ancestor cpuset.
 *
 * Scanning up parent cpusets requires callback_lock.  The
 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
 * current tasks mems_allowed came up empty on the first pass over
 * the zonelist.  So only GFP_KERNEL allocations, if all nodes in the
 * cpuset are short of memory, might require taking the callback_lock.
 *
 * In short, the rules are:
 *	in_interrupt - any node ok (current task context irrelevant)
 *	GFP_ATOMIC   - any node ok
 *	TIF_MEMDIE   - any node ok
 *	GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
 *	GFP_USER     - only nodes in current tasks mems allowed ok.
 */
2504int __cpuset_node_allowed(int node, gfp_t gfp_mask)
2505{
2506 struct cpuset *cs;
2507 int allowed;
2508 unsigned long flags;
2509
2510 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2511 return 1;
2512 if (node_isset(node, current->mems_allowed))
2513 return 1;
2514
2515
2516
2517
2518 if (unlikely(test_thread_flag(TIF_MEMDIE)))
2519 return 1;
2520 if (gfp_mask & __GFP_HARDWALL)
2521 return 0;
2522
2523 if (current->flags & PF_EXITING)
2524 return 1;
2525
2526
2527 spin_lock_irqsave(&callback_lock, flags);
2528
2529 rcu_read_lock();
2530 cs = nearest_hardwall_ancestor(task_cs(current));
2531 allowed = node_isset(node, cs->mems_allowed);
2532 rcu_read_unlock();
2533
2534 spin_unlock_irqrestore(&callback_lock, flags);
2535 return allowed;
2536}
2537
/*
 * cpuset_mem_spread_node() - On which node to begin search for a file page
 * cpuset_slab_spread_node() - On which node to begin search for a slab page
 *
 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
 * tasks in a cpuset with is_spread_page or is_spread_slab set),
 * and if the memory allocation used cpuset_mem_spread_node()
 * to determine on which node to start looking, as it will for
 * certain page cache or slab cache pages such as used for file
 * system buffers and inode caches, then instead of starting on the
 * local node to look for a free page, rather spread the starting
 * node around the tasks mems_allowed nodes.
 *
 * The returned node only determines where the search for a free
 * page begins; the zonelist passed to the page allocator still
 * includes all nodes, so an offline or short-of-memory node is
 * simply skipped.
 */
2565static int cpuset_spread_node(int *rotor)
2566{
2567 int node;
2568
2569 node = next_node(*rotor, current->mems_allowed);
2570 if (node == MAX_NUMNODES)
2571 node = first_node(current->mems_allowed);
2572 *rotor = node;
2573 return node;
2574}
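/*
 * Example: with mems_allowed = 0-3 and the rotor currently at node 2,
 * successive calls return 3, 0, 1, 2, ... - next_node() wraps around via
 * first_node() once it runs past the last allowed node.
 */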
2575
int cpuset_mem_spread_node(void)
{
	if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
		current->cpuset_mem_spread_rotor =
			node_random(&current->mems_allowed);

	return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
}

int cpuset_slab_spread_node(void)
{
	if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
		current->cpuset_slab_spread_rotor =
			node_random(&current->mems_allowed);

	return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
}
2593
2594EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
2595
/**
 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
 * @tsk1: pointer to task_struct of some task.
 * @tsk2: pointer to task_struct of some other task.
 *
 * Description: Return true if @tsk1's mems_allowed intersects the
 * mems_allowed of @tsk2.  Used by the OOM killer to determine if
 * one of the tasks' memory usage might impact the memory available
 * to the other.
 **/
2607int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2608 const struct task_struct *tsk2)
2609{
2610 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
2611}
2612
2613#define CPUSET_NODELIST_LEN (256)
2614
/**
 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
 * @tsk: pointer to task_struct of some task.
 *
 * Description: Prints @tsk's name, cpuset name, and cached copy of its
 * mems_allowed to the kernel log.
 */
2622void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2623{
2624
2625 static char cpuset_nodelist[CPUSET_NODELIST_LEN];
2626 static DEFINE_SPINLOCK(cpuset_buffer_lock);
2627 struct cgroup *cgrp;
2628
2629 spin_lock(&cpuset_buffer_lock);
2630 rcu_read_lock();
2631
2632 cgrp = task_cs(tsk)->css.cgroup;
2633 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
2634 tsk->mems_allowed);
2635 pr_info("%s cpuset=", tsk->comm);
2636 pr_cont_cgroup_name(cgrp);
2637 pr_cont(" mems_allowed=%s\n", cpuset_nodelist);
2638
2639 rcu_read_unlock();
2640 spin_unlock(&cpuset_buffer_lock);
2641}
2642
/*
 * Collection of memory_pressure is suppressed unless
 * this flag is enabled by writing "1" to the special
 * cpuset file 'memory_pressure_enabled' in the root cpuset.
 */
2649int cpuset_memory_pressure_enabled __read_mostly;
2650
/*
 * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
 *
 * Keep a running average of the rate of synchronous (direct)
 * page reclaim efforts initiated by tasks in each cpuset.
 *
 * This represents the rate at which some task in the cpuset
 * ran low on memory on all nodes it was allowed to use, and
 * had to enter the kernel's page reclaim code in an effort to
 * create more free memory.
 *
 * The value is reported through the per-cpuset read-only file
 * "memory_pressure", via fmeter_getrate() above.
 */
2669void __cpuset_memory_pressure_bump(void)
2670{
2671 rcu_read_lock();
2672 fmeter_markevent(&task_cs(current)->fmeter);
2673 rcu_read_unlock();
2674}
2675
2676#ifdef CONFIG_PROC_PID_CPUSET
/*
 * proc_cpuset_show()
 *  - Print tasks cpuset path into seq_file.
 *  - Used for /proc/<pid>/cpuset.
 */
2686int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
2687 struct pid *pid, struct task_struct *tsk)
2688{
2689 char *buf, *p;
2690 struct cgroup_subsys_state *css;
2691 int retval;
2692
2693 retval = -ENOMEM;
2694 buf = kmalloc(PATH_MAX, GFP_KERNEL);
2695 if (!buf)
2696 goto out;
2697
2698 retval = -ENAMETOOLONG;
2699 rcu_read_lock();
2700 css = task_css(tsk, cpuset_cgrp_id);
2701 p = cgroup_path(css->cgroup, buf, PATH_MAX);
2702 rcu_read_unlock();
2703 if (!p)
2704 goto out_free;
2705 seq_puts(m, p);
2706 seq_putc(m, '\n');
2707 retval = 0;
2708out_free:
2709 kfree(buf);
2710out:
2711 return retval;
2712}
2713#endif
/* Display task mems_allowed in /proc/<pid>/status file. */
2716void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2717{
2718 seq_puts(m, "Mems_allowed:\t");
2719 seq_nodemask(m, &task->mems_allowed);
2720 seq_puts(m, "\n");
2721 seq_puts(m, "Mems_allowed_list:\t");
2722 seq_nodemask_list(m, &task->mems_allowed);
2723 seq_puts(m, "\n");
2724}
2725