1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25#include <linux/cpu.h>
26#include <linux/cpumask.h>
27#include <linux/cpuset.h>
28#include <linux/err.h>
29#include <linux/errno.h>
30#include <linux/file.h>
31#include <linux/fs.h>
32#include <linux/init.h>
33#include <linux/interrupt.h>
34#include <linux/kernel.h>
35#include <linux/kmod.h>
36#include <linux/list.h>
37#include <linux/mempolicy.h>
38#include <linux/mm.h>
39#include <linux/memory.h>
40#include <linux/export.h>
41#include <linux/mount.h>
42#include <linux/namei.h>
43#include <linux/pagemap.h>
44#include <linux/proc_fs.h>
45#include <linux/rcupdate.h>
46#include <linux/sched.h>
47#include <linux/seq_file.h>
48#include <linux/security.h>
49#include <linux/slab.h>
50#include <linux/spinlock.h>
51#include <linux/stat.h>
52#include <linux/string.h>
53#include <linux/time.h>
54#include <linux/backing-dev.h>
55#include <linux/sort.h>
56
57#include <asm/uaccess.h>
58#include <linux/atomic.h>
59#include <linux/mutex.h>
60#include <linux/workqueue.h>
61#include <linux/cgroup.h>
62#include <linux/wait.h>
63
64struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
65
66
67
68struct fmeter {
69 int cnt;
70 int val;
71 time_t time;
72 spinlock_t lock;
73};
74
75struct cpuset {
76 struct cgroup_subsys_state css;
77
78 unsigned long flags;
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101 cpumask_var_t cpus_allowed;
102 nodemask_t mems_allowed;
103
104
105 cpumask_var_t effective_cpus;
106 nodemask_t effective_mems;
107
108
109
110
111
112
113
114
115
116
117
118 nodemask_t old_mems_allowed;
119
120 struct fmeter fmeter;
121
122
123
124
125
126 int attach_in_progress;
127
128
129 int pn;
130
131
132 int relax_domain_level;
133};
134
135static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
136{
137 return css ? container_of(css, struct cpuset, css) : NULL;
138}
139
140
141static inline struct cpuset *task_cs(struct task_struct *task)
142{
143 return css_cs(task_css(task, cpuset_cgrp_id));
144}
145
146static inline struct cpuset *parent_cs(struct cpuset *cs)
147{
148 return css_cs(cs->css.parent);
149}
150
151#ifdef CONFIG_NUMA
152static inline bool task_has_mempolicy(struct task_struct *task)
153{
154 return task->mempolicy;
155}
156#else
157static inline bool task_has_mempolicy(struct task_struct *task)
158{
159 return false;
160}
161#endif
162
163
164
/* Bit positions inside cpuset->flags; each has an is_*() test helper. */
typedef enum {
	CS_ONLINE,		/* tested by is_cpuset_online() */
	CS_CPU_EXCLUSIVE,	/* "cpu_exclusive" file */
	CS_MEM_EXCLUSIVE,	/* "mem_exclusive" file */
	CS_MEM_HARDWALL,	/* "mem_hardwall" file */
	CS_MEMORY_MIGRATE,	/* "memory_migrate" file */
	CS_SCHED_LOAD_BALANCE,	/* "sched_load_balance" file */
	CS_SPREAD_PAGE,		/* mirrored into tasks' spread-page flag */
	CS_SPREAD_SLAB,		/* mirrored into tasks' spread-slab flag */
} cpuset_flagbits_t;
175
176
177static inline bool is_cpuset_online(const struct cpuset *cs)
178{
179 return test_bit(CS_ONLINE, &cs->flags);
180}
181
182static inline int is_cpu_exclusive(const struct cpuset *cs)
183{
184 return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
185}
186
187static inline int is_mem_exclusive(const struct cpuset *cs)
188{
189 return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
190}
191
192static inline int is_mem_hardwall(const struct cpuset *cs)
193{
194 return test_bit(CS_MEM_HARDWALL, &cs->flags);
195}
196
197static inline int is_sched_load_balance(const struct cpuset *cs)
198{
199 return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
200}
201
202static inline int is_memory_migrate(const struct cpuset *cs)
203{
204 return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
205}
206
207static inline int is_spread_page(const struct cpuset *cs)
208{
209 return test_bit(CS_SPREAD_PAGE, &cs->flags);
210}
211
212static inline int is_spread_slab(const struct cpuset *cs)
213{
214 return test_bit(CS_SPREAD_SLAB, &cs->flags);
215}
216
217static struct cpuset top_cpuset = {
218 .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
219 (1 << CS_MEM_EXCLUSIVE)),
220};
221
222
223
224
225
226
227
228
229
230
/*
 * cpuset_for_each_child - walk the online children of @parent.
 * @child:  struct cpuset * loop cursor, assigned on every step
 * @pos:    struct cgroup_subsys_state * cursor for css_for_each_child()
 * @parent: cpuset whose children are visited
 *
 * Children whose cpuset is not online are skipped by the trailing "if".
 */
#define cpuset_for_each_child(child, pos, parent)			\
	css_for_each_child((pos), &(parent)->css)			\
		if (is_cpuset_online(((child) = css_cs((pos)))))
234
235
236
237
238
239
240
241
242
243
244
245
/*
 * cpuset_for_each_descendant_pre - walk the online cpusets found by
 * css_for_each_descendant_pre() starting at @root.
 * @des:  struct cpuset * loop cursor, assigned on every step
 * @pos:  struct cgroup_subsys_state * cursor for the css iterator
 * @root: cpuset at which the walk starts
 *
 * Offline cpusets are skipped by the trailing "if".  Callers in this file
 * prune a subtree by setting @pos to css_rightmost_descendant(@pos).
 */
#define cpuset_for_each_descendant_pre(des, pos, root)			\
	css_for_each_descendant_pre((pos), &(root)->css)		\
		if (is_cpuset_online(((des) = css_cs((pos)))))
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
/* Mutex taken by the configuration-write and attach paths in this file. */
static DEFINE_MUTEX(cpuset_mutex);
/* Spinlock held with irqs off while masks/flags are copied or printed. */
static DEFINE_SPINLOCK(callback_lock);

/* Hotplug work item; the handler is only forward-declared at this point. */
static void cpuset_hotplug_workfn(struct work_struct *work);
static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);

/* Woken by cpuset_attach() once a cpuset's attach_in_progress hits zero. */
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
296
297
298
299
300
301
302static struct dentry *cpuset_mount(struct file_system_type *fs_type,
303 int flags, const char *unused_dev_name, void *data)
304{
305 struct file_system_type *cgroup_fs = get_fs_type("cgroup");
306 struct dentry *ret = ERR_PTR(-ENODEV);
307 if (cgroup_fs) {
308 char mountopts[] =
309 "cpuset,noprefix,"
310 "release_agent=/sbin/cpuset_release_agent";
311 ret = cgroup_fs->mount(cgroup_fs, flags,
312 unused_dev_name, mountopts);
313 put_filesystem(cgroup_fs);
314 }
315 return ret;
316}
317
318static struct file_system_type cpuset_fs_type = {
319 .name = "cpuset",
320 .mount = cpuset_mount,
321};
322
323
324
325
326
327
328
329
330
331
332
333
334static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
335{
336 while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
337 cs = parent_cs(cs);
338 cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
339}
340
341
342
343
344
345
346
347
348
349
350
351
352static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
353{
354 while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
355 cs = parent_cs(cs);
356 nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
357}
358
359
360
361
362
363
/*
 * Make @tsk's spread-page and spread-slab task flags match the
 * CS_SPREAD_PAGE / CS_SPREAD_SLAB bits of @cs.
 */
static void cpuset_update_task_spread_flag(struct cpuset *cs,
					struct task_struct *tsk)
{
	if (!is_spread_page(cs))
		task_clear_spread_page(tsk);
	else
		task_set_spread_page(tsk);

	if (!is_spread_slab(cs))
		task_clear_spread_slab(tsk);
	else
		task_set_spread_slab(tsk);
}
377
378
379
380
381
382
383
384
385
386static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
387{
388 return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
389 nodes_subset(p->mems_allowed, q->mems_allowed) &&
390 is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
391 is_mem_exclusive(p) <= is_mem_exclusive(q);
392}
393
394
395
396
397
398static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
399{
400 struct cpuset *trial;
401
402 trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
403 if (!trial)
404 return NULL;
405
406 if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
407 goto free_cs;
408 if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
409 goto free_cpus;
410
411 cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
412 cpumask_copy(trial->effective_cpus, cs->effective_cpus);
413 return trial;
414
415free_cpus:
416 free_cpumask_var(trial->cpus_allowed);
417free_cs:
418 kfree(trial);
419 return NULL;
420}
421
422
423
424
425
426static void free_trial_cpuset(struct cpuset *trial)
427{
428 free_cpumask_var(trial->effective_cpus);
429 free_cpumask_var(trial->cpus_allowed);
430 kfree(trial);
431}
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453static int validate_change(struct cpuset *cur, struct cpuset *trial)
454{
455 struct cgroup_subsys_state *css;
456 struct cpuset *c, *par;
457 int ret;
458
459 rcu_read_lock();
460
461
462 ret = -EBUSY;
463 cpuset_for_each_child(c, css, cur)
464 if (!is_cpuset_subset(c, trial))
465 goto out;
466
467
468 ret = 0;
469 if (cur == &top_cpuset)
470 goto out;
471
472 par = parent_cs(cur);
473
474
475 ret = -EACCES;
476 if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par))
477 goto out;
478
479
480
481
482
483 ret = -EINVAL;
484 cpuset_for_each_child(c, css, par) {
485 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
486 c != cur &&
487 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
488 goto out;
489 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
490 c != cur &&
491 nodes_intersects(trial->mems_allowed, c->mems_allowed))
492 goto out;
493 }
494
495
496
497
498
499 ret = -ENOSPC;
500 if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) {
501 if (!cpumask_empty(cur->cpus_allowed) &&
502 cpumask_empty(trial->cpus_allowed))
503 goto out;
504 if (!nodes_empty(cur->mems_allowed) &&
505 nodes_empty(trial->mems_allowed))
506 goto out;
507 }
508
509
510
511
512
513 ret = -EBUSY;
514 if (is_cpu_exclusive(cur) &&
515 !cpuset_cpumask_can_shrink(cur->cpus_allowed,
516 trial->cpus_allowed))
517 goto out;
518
519 ret = 0;
520out:
521 rcu_read_unlock();
522 return ret;
523}
524
525#ifdef CONFIG_SMP
526
527
528
529
530static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
531{
532 return cpumask_intersects(a->effective_cpus, b->effective_cpus);
533}
534
535static void
536update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
537{
538 if (dattr->relax_domain_level < c->relax_domain_level)
539 dattr->relax_domain_level = c->relax_domain_level;
540 return;
541}
542
543static void update_domain_attr_tree(struct sched_domain_attr *dattr,
544 struct cpuset *root_cs)
545{
546 struct cpuset *cp;
547 struct cgroup_subsys_state *pos_css;
548
549 rcu_read_lock();
550 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
551
552 if (cpumask_empty(cp->cpus_allowed)) {
553 pos_css = css_rightmost_descendant(pos_css);
554 continue;
555 }
556
557 if (is_sched_load_balance(cp))
558 update_domain_attr(dattr, cp);
559 }
560 rcu_read_unlock();
561}
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617static int generate_sched_domains(cpumask_var_t **domains,
618 struct sched_domain_attr **attributes)
619{
620 struct cpuset *cp;
621 struct cpuset **csa;
622 int csn;
623 int i, j, k;
624 cpumask_var_t *doms;
625 struct sched_domain_attr *dattr;
626 int ndoms = 0;
627 int nslot;
628 struct cgroup_subsys_state *pos_css;
629
630 doms = NULL;
631 dattr = NULL;
632 csa = NULL;
633
634
635 if (is_sched_load_balance(&top_cpuset)) {
636 ndoms = 1;
637 doms = alloc_sched_domains(ndoms);
638 if (!doms)
639 goto done;
640
641 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
642 if (dattr) {
643 *dattr = SD_ATTR_INIT;
644 update_domain_attr_tree(dattr, &top_cpuset);
645 }
646 cpumask_copy(doms[0], top_cpuset.effective_cpus);
647
648 goto done;
649 }
650
651 csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL);
652 if (!csa)
653 goto done;
654 csn = 0;
655
656 rcu_read_lock();
657 cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
658 if (cp == &top_cpuset)
659 continue;
660
661
662
663
664
665
666
667
668 if (!cpumask_empty(cp->cpus_allowed) &&
669 !is_sched_load_balance(cp))
670 continue;
671
672 if (is_sched_load_balance(cp))
673 csa[csn++] = cp;
674
675
676 pos_css = css_rightmost_descendant(pos_css);
677 }
678 rcu_read_unlock();
679
680 for (i = 0; i < csn; i++)
681 csa[i]->pn = i;
682 ndoms = csn;
683
684restart:
685
686 for (i = 0; i < csn; i++) {
687 struct cpuset *a = csa[i];
688 int apn = a->pn;
689
690 for (j = 0; j < csn; j++) {
691 struct cpuset *b = csa[j];
692 int bpn = b->pn;
693
694 if (apn != bpn && cpusets_overlap(a, b)) {
695 for (k = 0; k < csn; k++) {
696 struct cpuset *c = csa[k];
697
698 if (c->pn == bpn)
699 c->pn = apn;
700 }
701 ndoms--;
702 goto restart;
703 }
704 }
705 }
706
707
708
709
710
711 doms = alloc_sched_domains(ndoms);
712 if (!doms)
713 goto done;
714
715
716
717
718
719 dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
720
721 for (nslot = 0, i = 0; i < csn; i++) {
722 struct cpuset *a = csa[i];
723 struct cpumask *dp;
724 int apn = a->pn;
725
726 if (apn < 0) {
727
728 continue;
729 }
730
731 dp = doms[nslot];
732
733 if (nslot == ndoms) {
734 static int warnings = 10;
735 if (warnings) {
736 pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
737 nslot, ndoms, csn, i, apn);
738 warnings--;
739 }
740 continue;
741 }
742
743 cpumask_clear(dp);
744 if (dattr)
745 *(dattr + nslot) = SD_ATTR_INIT;
746 for (j = i; j < csn; j++) {
747 struct cpuset *b = csa[j];
748
749 if (apn == b->pn) {
750 cpumask_or(dp, dp, b->effective_cpus);
751 if (dattr)
752 update_domain_attr_tree(dattr + nslot, b);
753
754
755 b->pn = -1;
756 }
757 }
758 nslot++;
759 }
760 BUG_ON(nslot != ndoms);
761
762done:
763 kfree(csa);
764
765
766
767
768
769 if (doms == NULL)
770 ndoms = 1;
771
772 *domains = doms;
773 *attributes = dattr;
774 return ndoms;
775}
776
777
778
779
780
781
782
783
784
785
786
787
788static void rebuild_sched_domains_locked(void)
789{
790 struct sched_domain_attr *attr;
791 cpumask_var_t *doms;
792 int ndoms;
793
794 lockdep_assert_held(&cpuset_mutex);
795 get_online_cpus();
796
797
798
799
800
801
802 if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
803 goto out;
804
805
806 ndoms = generate_sched_domains(&doms, &attr);
807
808
809 partition_sched_domains(ndoms, doms, attr);
810out:
811 put_online_cpus();
812}
813#else
/* !CONFIG_SMP stub: there are no sched domains to rebuild. */
static void rebuild_sched_domains_locked(void)
{
	/* intentionally empty */
}
817#endif
818
819void rebuild_sched_domains(void)
820{
821 mutex_lock(&cpuset_mutex);
822 rebuild_sched_domains_locked();
823 mutex_unlock(&cpuset_mutex);
824}
825
826
827
828
829
830
831
832
833
834static void update_tasks_cpumask(struct cpuset *cs)
835{
836 struct css_task_iter it;
837 struct task_struct *task;
838
839 css_task_iter_start(&cs->css, &it);
840 while ((task = css_task_iter_next(&it)))
841 set_cpus_allowed_ptr(task, cs->effective_cpus);
842 css_task_iter_end(&it);
843}
844
845
846
847
848
849
850
851
852
853
854
855
856
857static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
858{
859 struct cpuset *cp;
860 struct cgroup_subsys_state *pos_css;
861 bool need_rebuild_sched_domains = false;
862
863 rcu_read_lock();
864 cpuset_for_each_descendant_pre(cp, pos_css, cs) {
865 struct cpuset *parent = parent_cs(cp);
866
867 cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
868
869
870
871
872
873 if (cgroup_on_dfl(cp->css.cgroup) && cpumask_empty(new_cpus))
874 cpumask_copy(new_cpus, parent->effective_cpus);
875
876
877 if (cpumask_equal(new_cpus, cp->effective_cpus)) {
878 pos_css = css_rightmost_descendant(pos_css);
879 continue;
880 }
881
882 if (!css_tryget_online(&cp->css))
883 continue;
884 rcu_read_unlock();
885
886 spin_lock_irq(&callback_lock);
887 cpumask_copy(cp->effective_cpus, new_cpus);
888 spin_unlock_irq(&callback_lock);
889
890 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
891 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
892
893 update_tasks_cpumask(cp);
894
895
896
897
898
899 if (!cpumask_empty(cp->cpus_allowed) &&
900 is_sched_load_balance(cp))
901 need_rebuild_sched_domains = true;
902
903 rcu_read_lock();
904 css_put(&cp->css);
905 }
906 rcu_read_unlock();
907
908 if (need_rebuild_sched_domains)
909 rebuild_sched_domains_locked();
910}
911
912
913
914
915
916
917
918static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
919 const char *buf)
920{
921 int retval;
922
923
924 if (cs == &top_cpuset)
925 return -EACCES;
926
927
928
929
930
931
932
933 if (!*buf) {
934 cpumask_clear(trialcs->cpus_allowed);
935 } else {
936 retval = cpulist_parse(buf, trialcs->cpus_allowed);
937 if (retval < 0)
938 return retval;
939
940 if (!cpumask_subset(trialcs->cpus_allowed,
941 top_cpuset.cpus_allowed))
942 return -EINVAL;
943 }
944
945
946 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
947 return 0;
948
949 retval = validate_change(cs, trialcs);
950 if (retval < 0)
951 return retval;
952
953 spin_lock_irq(&callback_lock);
954 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
955 spin_unlock_irq(&callback_lock);
956
957
958 update_cpumasks_hier(cs, trialcs->cpus_allowed);
959 return 0;
960}
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
977 const nodemask_t *to)
978{
979 struct task_struct *tsk = current;
980
981 tsk->mems_allowed = *to;
982
983 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
984
985 rcu_read_lock();
986 guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
987 rcu_read_unlock();
988}
989
990
991
992
993
994
995
996
997
998
999static void cpuset_change_task_nodemask(struct task_struct *tsk,
1000 nodemask_t *newmems)
1001{
1002 bool need_loop;
1003
1004
1005
1006
1007
1008 if (unlikely(test_thread_flag(TIF_MEMDIE)))
1009 return;
1010 if (current->flags & PF_EXITING)
1011 return;
1012
1013 task_lock(tsk);
1014
1015
1016
1017
1018
1019
1020 need_loop = task_has_mempolicy(tsk) ||
1021 !nodes_intersects(*newmems, tsk->mems_allowed);
1022
1023 if (need_loop) {
1024 local_irq_disable();
1025 write_seqcount_begin(&tsk->mems_allowed_seq);
1026 }
1027
1028 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
1029 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
1030
1031 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
1032 tsk->mems_allowed = *newmems;
1033
1034 if (need_loop) {
1035 write_seqcount_end(&tsk->mems_allowed_seq);
1036 local_irq_enable();
1037 }
1038
1039 task_unlock(tsk);
1040}
1041
/*
 * Cpuset whose tasks update_tasks_nodemask() is currently processing, or
 * NULL otherwise; compared against in current_cpuset_is_being_rebound().
 */
static void *cpuset_being_rebound;
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052static void update_tasks_nodemask(struct cpuset *cs)
1053{
1054 static nodemask_t newmems;
1055 struct css_task_iter it;
1056 struct task_struct *task;
1057
1058 cpuset_being_rebound = cs;
1059
1060 guarantee_online_mems(cs, &newmems);
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072 css_task_iter_start(&cs->css, &it);
1073 while ((task = css_task_iter_next(&it))) {
1074 struct mm_struct *mm;
1075 bool migrate;
1076
1077 cpuset_change_task_nodemask(task, &newmems);
1078
1079 mm = get_task_mm(task);
1080 if (!mm)
1081 continue;
1082
1083 migrate = is_memory_migrate(cs);
1084
1085 mpol_rebind_mm(mm, &cs->mems_allowed);
1086 if (migrate)
1087 cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
1088 mmput(mm);
1089 }
1090 css_task_iter_end(&it);
1091
1092
1093
1094
1095
1096 cs->old_mems_allowed = newmems;
1097
1098
1099 cpuset_being_rebound = NULL;
1100}
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
1115{
1116 struct cpuset *cp;
1117 struct cgroup_subsys_state *pos_css;
1118
1119 rcu_read_lock();
1120 cpuset_for_each_descendant_pre(cp, pos_css, cs) {
1121 struct cpuset *parent = parent_cs(cp);
1122
1123 nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
1124
1125
1126
1127
1128
1129 if (cgroup_on_dfl(cp->css.cgroup) && nodes_empty(*new_mems))
1130 *new_mems = parent->effective_mems;
1131
1132
1133 if (nodes_equal(*new_mems, cp->effective_mems)) {
1134 pos_css = css_rightmost_descendant(pos_css);
1135 continue;
1136 }
1137
1138 if (!css_tryget_online(&cp->css))
1139 continue;
1140 rcu_read_unlock();
1141
1142 spin_lock_irq(&callback_lock);
1143 cp->effective_mems = *new_mems;
1144 spin_unlock_irq(&callback_lock);
1145
1146 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
1147 !nodes_equal(cp->mems_allowed, cp->effective_mems));
1148
1149 update_tasks_nodemask(cp);
1150
1151 rcu_read_lock();
1152 css_put(&cp->css);
1153 }
1154 rcu_read_unlock();
1155}
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1171 const char *buf)
1172{
1173 int retval;
1174
1175
1176
1177
1178
1179 if (cs == &top_cpuset) {
1180 retval = -EACCES;
1181 goto done;
1182 }
1183
1184
1185
1186
1187
1188
1189
1190 if (!*buf) {
1191 nodes_clear(trialcs->mems_allowed);
1192 } else {
1193 retval = nodelist_parse(buf, trialcs->mems_allowed);
1194 if (retval < 0)
1195 goto done;
1196
1197 if (!nodes_subset(trialcs->mems_allowed,
1198 top_cpuset.mems_allowed)) {
1199 retval = -EINVAL;
1200 goto done;
1201 }
1202 }
1203
1204 if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
1205 retval = 0;
1206 goto done;
1207 }
1208 retval = validate_change(cs, trialcs);
1209 if (retval < 0)
1210 goto done;
1211
1212 spin_lock_irq(&callback_lock);
1213 cs->mems_allowed = trialcs->mems_allowed;
1214 spin_unlock_irq(&callback_lock);
1215
1216
1217 update_nodemasks_hier(cs, &cs->mems_allowed);
1218done:
1219 return retval;
1220}
1221
1222int current_cpuset_is_being_rebound(void)
1223{
1224 int ret;
1225
1226 rcu_read_lock();
1227 ret = task_cs(current) == cpuset_being_rebound;
1228 rcu_read_unlock();
1229
1230 return ret;
1231}
1232
1233static int update_relax_domain_level(struct cpuset *cs, s64 val)
1234{
1235#ifdef CONFIG_SMP
1236 if (val < -1 || val >= sched_domain_level_max)
1237 return -EINVAL;
1238#endif
1239
1240 if (val != cs->relax_domain_level) {
1241 cs->relax_domain_level = val;
1242 if (!cpumask_empty(cs->cpus_allowed) &&
1243 is_sched_load_balance(cs))
1244 rebuild_sched_domains_locked();
1245 }
1246
1247 return 0;
1248}
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258static void update_tasks_flags(struct cpuset *cs)
1259{
1260 struct css_task_iter it;
1261 struct task_struct *task;
1262
1263 css_task_iter_start(&cs->css, &it);
1264 while ((task = css_task_iter_next(&it)))
1265 cpuset_update_task_spread_flag(cs, task);
1266 css_task_iter_end(&it);
1267}
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1279 int turning_on)
1280{
1281 struct cpuset *trialcs;
1282 int balance_flag_changed;
1283 int spread_flag_changed;
1284 int err;
1285
1286 trialcs = alloc_trial_cpuset(cs);
1287 if (!trialcs)
1288 return -ENOMEM;
1289
1290 if (turning_on)
1291 set_bit(bit, &trialcs->flags);
1292 else
1293 clear_bit(bit, &trialcs->flags);
1294
1295 err = validate_change(cs, trialcs);
1296 if (err < 0)
1297 goto out;
1298
1299 balance_flag_changed = (is_sched_load_balance(cs) !=
1300 is_sched_load_balance(trialcs));
1301
1302 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
1303 || (is_spread_page(cs) != is_spread_page(trialcs)));
1304
1305 spin_lock_irq(&callback_lock);
1306 cs->flags = trialcs->flags;
1307 spin_unlock_irq(&callback_lock);
1308
1309 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1310 rebuild_sched_domains_locked();
1311
1312 if (spread_flag_changed)
1313 update_tasks_flags(cs);
1314out:
1315 free_trial_cpuset(trialcs);
1316 return err;
1317}
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
/*
 * Frequency-meter tuning constants used by the fmeter_*() helpers.
 * Values are fixed point with FM_SCALE == 1.0.
 */
#define FM_COEF 933		/* per-tick decay 0.933; 0.933^10 is about 0.5 */
#define FM_MAXTICKS ((time_t)99) /* cap on decay steps applied per update */
#define FM_MAXCNT 1000000	/* upper bound for the pending event count */
#define FM_SCALE 1000		/* fixed-point scale factor */
1368
1369
1370static void fmeter_init(struct fmeter *fmp)
1371{
1372 fmp->cnt = 0;
1373 fmp->val = 0;
1374 fmp->time = 0;
1375 spin_lock_init(&fmp->lock);
1376}
1377
1378
1379static void fmeter_update(struct fmeter *fmp)
1380{
1381 time_t now = get_seconds();
1382 time_t ticks = now - fmp->time;
1383
1384 if (ticks == 0)
1385 return;
1386
1387 ticks = min(FM_MAXTICKS, ticks);
1388 while (ticks-- > 0)
1389 fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
1390 fmp->time = now;
1391
1392 fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
1393 fmp->cnt = 0;
1394}
1395
1396
1397static void fmeter_markevent(struct fmeter *fmp)
1398{
1399 spin_lock(&fmp->lock);
1400 fmeter_update(fmp);
1401 fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
1402 spin_unlock(&fmp->lock);
1403}
1404
1405
1406static int fmeter_getrate(struct fmeter *fmp)
1407{
1408 int val;
1409
1410 spin_lock(&fmp->lock);
1411 fmeter_update(fmp);
1412 val = fmp->val;
1413 spin_unlock(&fmp->lock);
1414 return val;
1415}
1416
/* Source cpuset recorded by cpuset_can_attach() for use in cpuset_attach(). */
static struct cpuset *cpuset_attach_old_cs;
1418
1419
1420static int cpuset_can_attach(struct cgroup_subsys_state *css,
1421 struct cgroup_taskset *tset)
1422{
1423 struct cpuset *cs = css_cs(css);
1424 struct task_struct *task;
1425 int ret;
1426
1427
1428 cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset));
1429
1430 mutex_lock(&cpuset_mutex);
1431
1432
1433 ret = -ENOSPC;
1434 if (!cgroup_on_dfl(css->cgroup) &&
1435 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
1436 goto out_unlock;
1437
1438 cgroup_taskset_for_each(task, tset) {
1439 ret = task_can_attach(task, cs->cpus_allowed);
1440 if (ret)
1441 goto out_unlock;
1442 ret = security_task_setscheduler(task);
1443 if (ret)
1444 goto out_unlock;
1445 }
1446
1447
1448
1449
1450
1451 cs->attach_in_progress++;
1452 ret = 0;
1453out_unlock:
1454 mutex_unlock(&cpuset_mutex);
1455 return ret;
1456}
1457
1458static void cpuset_cancel_attach(struct cgroup_subsys_state *css,
1459 struct cgroup_taskset *tset)
1460{
1461 mutex_lock(&cpuset_mutex);
1462 css_cs(css)->attach_in_progress--;
1463 mutex_unlock(&cpuset_mutex);
1464}
1465
1466
1467
1468
1469
1470
1471static cpumask_var_t cpus_attach;
1472
1473static void cpuset_attach(struct cgroup_subsys_state *css,
1474 struct cgroup_taskset *tset)
1475{
1476
1477 static nodemask_t cpuset_attach_nodemask_to;
1478 struct mm_struct *mm;
1479 struct task_struct *task;
1480 struct task_struct *leader = cgroup_taskset_first(tset);
1481 struct cpuset *cs = css_cs(css);
1482 struct cpuset *oldcs = cpuset_attach_old_cs;
1483
1484 mutex_lock(&cpuset_mutex);
1485
1486
1487 if (cs == &top_cpuset)
1488 cpumask_copy(cpus_attach, cpu_possible_mask);
1489 else
1490 guarantee_online_cpus(cs, cpus_attach);
1491
1492 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
1493
1494 cgroup_taskset_for_each(task, tset) {
1495
1496
1497
1498
1499 WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
1500
1501 cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
1502 cpuset_update_task_spread_flag(cs, task);
1503 }
1504
1505
1506
1507
1508
1509 cpuset_attach_nodemask_to = cs->effective_mems;
1510 mm = get_task_mm(leader);
1511 if (mm) {
1512 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
1513
1514
1515
1516
1517
1518
1519
1520
1521 if (is_memory_migrate(cs)) {
1522 cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
1523 &cpuset_attach_nodemask_to);
1524 }
1525 mmput(mm);
1526 }
1527
1528 cs->old_mems_allowed = cpuset_attach_nodemask_to;
1529
1530 cs->attach_in_progress--;
1531 if (!cs->attach_in_progress)
1532 wake_up(&cpuset_attach_wq);
1533
1534 mutex_unlock(&cpuset_mutex);
1535}
1536
1537
1538
/*
 * Identifiers stored in cftype->private; the read/write handlers switch
 * on them to tell the control files apart.
 */
typedef enum {
	FILE_MEMORY_MIGRATE,
	FILE_CPULIST,			/* "cpus" */
	FILE_MEMLIST,			/* "mems" */
	FILE_EFFECTIVE_CPULIST,		/* "effective_cpus" */
	FILE_EFFECTIVE_MEMLIST,		/* "effective_mems" */
	FILE_CPU_EXCLUSIVE,
	FILE_MEM_EXCLUSIVE,
	FILE_MEM_HARDWALL,
	FILE_SCHED_LOAD_BALANCE,
	FILE_SCHED_RELAX_DOMAIN_LEVEL,
	FILE_MEMORY_PRESSURE_ENABLED,
	FILE_MEMORY_PRESSURE,		/* rejected with -EACCES on write */
	FILE_SPREAD_PAGE,
	FILE_SPREAD_SLAB,
} cpuset_filetype_t;
1555
1556static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
1557 u64 val)
1558{
1559 struct cpuset *cs = css_cs(css);
1560 cpuset_filetype_t type = cft->private;
1561 int retval = 0;
1562
1563 mutex_lock(&cpuset_mutex);
1564 if (!is_cpuset_online(cs)) {
1565 retval = -ENODEV;
1566 goto out_unlock;
1567 }
1568
1569 switch (type) {
1570 case FILE_CPU_EXCLUSIVE:
1571 retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
1572 break;
1573 case FILE_MEM_EXCLUSIVE:
1574 retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
1575 break;
1576 case FILE_MEM_HARDWALL:
1577 retval = update_flag(CS_MEM_HARDWALL, cs, val);
1578 break;
1579 case FILE_SCHED_LOAD_BALANCE:
1580 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
1581 break;
1582 case FILE_MEMORY_MIGRATE:
1583 retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
1584 break;
1585 case FILE_MEMORY_PRESSURE_ENABLED:
1586 cpuset_memory_pressure_enabled = !!val;
1587 break;
1588 case FILE_MEMORY_PRESSURE:
1589 retval = -EACCES;
1590 break;
1591 case FILE_SPREAD_PAGE:
1592 retval = update_flag(CS_SPREAD_PAGE, cs, val);
1593 break;
1594 case FILE_SPREAD_SLAB:
1595 retval = update_flag(CS_SPREAD_SLAB, cs, val);
1596 break;
1597 default:
1598 retval = -EINVAL;
1599 break;
1600 }
1601out_unlock:
1602 mutex_unlock(&cpuset_mutex);
1603 return retval;
1604}
1605
1606static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
1607 s64 val)
1608{
1609 struct cpuset *cs = css_cs(css);
1610 cpuset_filetype_t type = cft->private;
1611 int retval = -ENODEV;
1612
1613 mutex_lock(&cpuset_mutex);
1614 if (!is_cpuset_online(cs))
1615 goto out_unlock;
1616
1617 switch (type) {
1618 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1619 retval = update_relax_domain_level(cs, val);
1620 break;
1621 default:
1622 retval = -EINVAL;
1623 break;
1624 }
1625out_unlock:
1626 mutex_unlock(&cpuset_mutex);
1627 return retval;
1628}
1629
1630
1631
1632
1633static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
1634 char *buf, size_t nbytes, loff_t off)
1635{
1636 struct cpuset *cs = css_cs(of_css(of));
1637 struct cpuset *trialcs;
1638 int retval = -ENODEV;
1639
1640 buf = strstrip(buf);
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661 css_get(&cs->css);
1662 kernfs_break_active_protection(of->kn);
1663 flush_work(&cpuset_hotplug_work);
1664
1665 mutex_lock(&cpuset_mutex);
1666 if (!is_cpuset_online(cs))
1667 goto out_unlock;
1668
1669 trialcs = alloc_trial_cpuset(cs);
1670 if (!trialcs) {
1671 retval = -ENOMEM;
1672 goto out_unlock;
1673 }
1674
1675 switch (of_cft(of)->private) {
1676 case FILE_CPULIST:
1677 retval = update_cpumask(cs, trialcs, buf);
1678 break;
1679 case FILE_MEMLIST:
1680 retval = update_nodemask(cs, trialcs, buf);
1681 break;
1682 default:
1683 retval = -EINVAL;
1684 break;
1685 }
1686
1687 free_trial_cpuset(trialcs);
1688out_unlock:
1689 mutex_unlock(&cpuset_mutex);
1690 kernfs_unbreak_active_protection(of->kn);
1691 css_put(&cs->css);
1692 return retval ?: nbytes;
1693}
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703static int cpuset_common_seq_show(struct seq_file *sf, void *v)
1704{
1705 struct cpuset *cs = css_cs(seq_css(sf));
1706 cpuset_filetype_t type = seq_cft(sf)->private;
1707 int ret = 0;
1708
1709 spin_lock_irq(&callback_lock);
1710
1711 switch (type) {
1712 case FILE_CPULIST:
1713 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
1714 break;
1715 case FILE_MEMLIST:
1716 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
1717 break;
1718 case FILE_EFFECTIVE_CPULIST:
1719 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
1720 break;
1721 case FILE_EFFECTIVE_MEMLIST:
1722 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
1723 break;
1724 default:
1725 ret = -EINVAL;
1726 }
1727
1728 spin_unlock_irq(&callback_lock);
1729 return ret;
1730}
1731
1732static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
1733{
1734 struct cpuset *cs = css_cs(css);
1735 cpuset_filetype_t type = cft->private;
1736 switch (type) {
1737 case FILE_CPU_EXCLUSIVE:
1738 return is_cpu_exclusive(cs);
1739 case FILE_MEM_EXCLUSIVE:
1740 return is_mem_exclusive(cs);
1741 case FILE_MEM_HARDWALL:
1742 return is_mem_hardwall(cs);
1743 case FILE_SCHED_LOAD_BALANCE:
1744 return is_sched_load_balance(cs);
1745 case FILE_MEMORY_MIGRATE:
1746 return is_memory_migrate(cs);
1747 case FILE_MEMORY_PRESSURE_ENABLED:
1748 return cpuset_memory_pressure_enabled;
1749 case FILE_MEMORY_PRESSURE:
1750 return fmeter_getrate(&cs->fmeter);
1751 case FILE_SPREAD_PAGE:
1752 return is_spread_page(cs);
1753 case FILE_SPREAD_SLAB:
1754 return is_spread_slab(cs);
1755 default:
1756 BUG();
1757 }
1758
1759
1760 return 0;
1761}
1762
1763static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
1764{
1765 struct cpuset *cs = css_cs(css);
1766 cpuset_filetype_t type = cft->private;
1767 switch (type) {
1768 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1769 return cs->relax_domain_level;
1770 default:
1771 BUG();
1772 }
1773
1774
1775 return 0;
1776}
1777
1778
1779
1780
1781
1782
1783static struct cftype files[] = {
1784 {
1785 .name = "cpus",
1786 .seq_show = cpuset_common_seq_show,
1787 .write = cpuset_write_resmask,
1788 .max_write_len = (100U + 6 * NR_CPUS),
1789 .private = FILE_CPULIST,
1790 },
1791
1792 {
1793 .name = "mems",
1794 .seq_show = cpuset_common_seq_show,
1795 .write = cpuset_write_resmask,
1796 .max_write_len = (100U + 6 * MAX_NUMNODES),
1797 .private = FILE_MEMLIST,
1798 },
1799
1800 {
1801 .name = "effective_cpus",
1802 .seq_show = cpuset_common_seq_show,
1803 .private = FILE_EFFECTIVE_CPULIST,
1804 },
1805
1806 {
1807 .name = "effective_mems",
1808 .seq_show = cpuset_common_seq_show,
1809 .private = FILE_EFFECTIVE_MEMLIST,
1810 },
1811
1812 {
1813 .name = "cpu_exclusive",
1814 .read_u64 = cpuset_read_u64,
1815 .write_u64 = cpuset_write_u64,
1816 .private = FILE_CPU_EXCLUSIVE,
1817 },
1818
1819 {
1820 .name = "mem_exclusive",
1821 .read_u64 = cpuset_read_u64,
1822 .write_u64 = cpuset_write_u64,
1823 .private = FILE_MEM_EXCLUSIVE,
1824 },
1825
1826 {
1827 .name = "mem_hardwall",
1828 .read_u64 = cpuset_read_u64,
1829 .write_u64 = cpuset_write_u64,
1830 .private = FILE_MEM_HARDWALL,
1831 },
1832
1833 {
1834 .name = "sched_load_balance",
1835 .read_u64 = cpuset_read_u64,
1836 .write_u64 = cpuset_write_u64,
1837 .private = FILE_SCHED_LOAD_BALANCE,
1838 },
1839
1840 {
1841 .name = "sched_relax_domain_level",
1842 .read_s64 = cpuset_read_s64,
1843 .write_s64 = cpuset_write_s64,
1844 .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
1845 },
1846
1847 {
1848 .name = "memory_migrate",
1849 .read_u64 = cpuset_read_u64,
1850 .write_u64 = cpuset_write_u64,
1851 .private = FILE_MEMORY_MIGRATE,
1852 },
1853
1854 {
1855 .name = "memory_pressure",
1856 .read_u64 = cpuset_read_u64,
1857 .write_u64 = cpuset_write_u64,
1858 .private = FILE_MEMORY_PRESSURE,
1859 .mode = S_IRUGO,
1860 },
1861
1862 {
1863 .name = "memory_spread_page",
1864 .read_u64 = cpuset_read_u64,
1865 .write_u64 = cpuset_write_u64,
1866 .private = FILE_SPREAD_PAGE,
1867 },
1868
1869 {
1870 .name = "memory_spread_slab",
1871 .read_u64 = cpuset_read_u64,
1872 .write_u64 = cpuset_write_u64,
1873 .private = FILE_SPREAD_SLAB,
1874 },
1875
1876 {
1877 .name = "memory_pressure_enabled",
1878 .flags = CFTYPE_ONLY_ON_ROOT,
1879 .read_u64 = cpuset_read_u64,
1880 .write_u64 = cpuset_write_u64,
1881 .private = FILE_MEMORY_PRESSURE_ENABLED,
1882 },
1883
1884 { }
1885};
1886
1887
1888
1889
1890
1891
1892static struct cgroup_subsys_state *
1893cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
1894{
1895 struct cpuset *cs;
1896
1897 if (!parent_css)
1898 return &top_cpuset.css;
1899
1900 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
1901 if (!cs)
1902 return ERR_PTR(-ENOMEM);
1903 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
1904 goto free_cs;
1905 if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
1906 goto free_cpus;
1907
1908 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1909 cpumask_clear(cs->cpus_allowed);
1910 nodes_clear(cs->mems_allowed);
1911 cpumask_clear(cs->effective_cpus);
1912 nodes_clear(cs->effective_mems);
1913 fmeter_init(&cs->fmeter);
1914 cs->relax_domain_level = -1;
1915
1916 return &cs->css;
1917
1918free_cpus:
1919 free_cpumask_var(cs->cpus_allowed);
1920free_cs:
1921 kfree(cs);
1922 return ERR_PTR(-ENOMEM);
1923}
1924
/*
 * cpuset_css_online - bring a freshly allocated cpuset online
 * @css: css of the cpuset going online
 *
 * Marks the cpuset online, inherits the parent's spread flags, and seeds
 * its masks.  Always returns 0.
 */
static int cpuset_css_online(struct cgroup_subsys_state *css)
{
	struct cpuset *cs = css_cs(css);
	struct cpuset *parent = parent_cs(cs);
	struct cpuset *tmp_cs;
	struct cgroup_subsys_state *pos_css;

	/* A cpuset without a parent needs no setup here. */
	if (!parent)
		return 0;

	mutex_lock(&cpuset_mutex);

	set_bit(CS_ONLINE, &cs->flags);
	/* The memory-spread flags are inherited from the parent. */
	if (is_spread_page(parent))
		set_bit(CS_SPREAD_PAGE, &cs->flags);
	if (is_spread_slab(parent))
		set_bit(CS_SPREAD_SLAB, &cs->flags);

	/* NOTE(review): presumably accounts for one more active cpuset - confirm */
	cpuset_inc();

	/* On the default hierarchy the child starts with the parent's effective masks. */
	spin_lock_irq(&callback_lock);
	if (cgroup_on_dfl(cs->css.cgroup)) {
		cpumask_copy(cs->effective_cpus, parent->effective_cpus);
		cs->effective_mems = parent->effective_mems;
	}
	spin_unlock_irq(&callback_lock);

	/* Only clone the parent's configuration when the cgroup asks for it. */
	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
		goto out_unlock;

	/*
	 * Skip the clone if any child of the parent visited by the iterator
	 * is cpu- or mem-exclusive.  The RCU read lock is dropped on the
	 * early exit as well as on the normal path.
	 */
	rcu_read_lock();
	cpuset_for_each_child(tmp_cs, pos_css, parent) {
		if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
			rcu_read_unlock();
			goto out_unlock;
		}
	}
	rcu_read_unlock();

	/*
	 * Clone the parent's configured masks.  Note that the effective
	 * masks are also set from the parent's configured (not effective)
	 * masks here.
	 */
	spin_lock_irq(&callback_lock);
	cs->mems_allowed = parent->mems_allowed;
	cs->effective_mems = parent->mems_allowed;
	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
	cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
	spin_unlock_irq(&callback_lock);
out_unlock:
	mutex_unlock(&cpuset_mutex);
	return 0;
}
1987
1988
1989
1990
1991
1992
1993
/*
 * cpuset_css_offline - take a cpuset offline
 * @css: css of the cpuset going offline
 *
 * Runs entirely under cpuset_mutex.
 */
static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
	struct cpuset *cs = css_cs(css);

	mutex_lock(&cpuset_mutex);

	/* Turn load balancing off through update_flag() rather than clearing the bit directly. */
	if (is_sched_load_balance(cs))
		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);

	/* NOTE(review): presumably the counterpart of cpuset_inc() in css_online - confirm */
	cpuset_dec();
	clear_bit(CS_ONLINE, &cs->flags);

	mutex_unlock(&cpuset_mutex);
}
2008
2009static void cpuset_css_free(struct cgroup_subsys_state *css)
2010{
2011 struct cpuset *cs = css_cs(css);
2012
2013 free_cpumask_var(cs->effective_cpus);
2014 free_cpumask_var(cs->cpus_allowed);
2015 kfree(cs);
2016}
2017
2018static void cpuset_bind(struct cgroup_subsys_state *root_css)
2019{
2020 mutex_lock(&cpuset_mutex);
2021 spin_lock_irq(&callback_lock);
2022
2023 if (cgroup_on_dfl(root_css->cgroup)) {
2024 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
2025 top_cpuset.mems_allowed = node_possible_map;
2026 } else {
2027 cpumask_copy(top_cpuset.cpus_allowed,
2028 top_cpuset.effective_cpus);
2029 top_cpuset.mems_allowed = top_cpuset.effective_mems;
2030 }
2031
2032 spin_unlock_irq(&callback_lock);
2033 mutex_unlock(&cpuset_mutex);
2034}
2035
2036struct cgroup_subsys cpuset_cgrp_subsys = {
2037 .css_alloc = cpuset_css_alloc,
2038 .css_online = cpuset_css_online,
2039 .css_offline = cpuset_css_offline,
2040 .css_free = cpuset_css_free,
2041 .can_attach = cpuset_can_attach,
2042 .cancel_attach = cpuset_cancel_attach,
2043 .attach = cpuset_attach,
2044 .bind = cpuset_bind,
2045 .legacy_cftypes = files,
2046 .early_init = 1,
2047};
2048
2049
2050
2051
2052
2053
2054
2055int __init cpuset_init(void)
2056{
2057 int err = 0;
2058
2059 if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
2060 BUG();
2061 if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
2062 BUG();
2063
2064 cpumask_setall(top_cpuset.cpus_allowed);
2065 nodes_setall(top_cpuset.mems_allowed);
2066 cpumask_setall(top_cpuset.effective_cpus);
2067 nodes_setall(top_cpuset.effective_mems);
2068
2069 fmeter_init(&top_cpuset.fmeter);
2070 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
2071 top_cpuset.relax_domain_level = -1;
2072
2073 err = register_filesystem(&cpuset_fs_type);
2074 if (err < 0)
2075 return err;
2076
2077 if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
2078 BUG();
2079
2080 return 0;
2081}
2082
2083
2084
2085
2086
2087
2088
2089
2090static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2091{
2092 struct cpuset *parent;
2093
2094
2095
2096
2097
2098 parent = parent_cs(cs);
2099 while (cpumask_empty(parent->cpus_allowed) ||
2100 nodes_empty(parent->mems_allowed))
2101 parent = parent_cs(parent);
2102
2103 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
2104 pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
2105 pr_cont_cgroup_name(cs->css.cgroup);
2106 pr_cont("\n");
2107 }
2108}
2109
/*
 * hotplug_update_tasks_legacy - apply hotplug results on a legacy hierarchy
 * @cs: cpuset being updated
 * @new_cpus: CPU mask to install
 * @new_mems: node mask to install
 * @cpus_updated: whether the CPU mask changed
 * @mems_updated: whether the node mask changed
 *
 * Entered and left with cpuset_mutex held; the mutex is dropped
 * temporarily in the middle (see below).
 */
static void
hotplug_update_tasks_legacy(struct cpuset *cs,
			    struct cpumask *new_cpus, nodemask_t *new_mems,
			    bool cpus_updated, bool mems_updated)
{
	bool is_empty;

	/* Here the configured and effective masks are both overwritten. */
	spin_lock_irq(&callback_lock);
	cpumask_copy(cs->cpus_allowed, new_cpus);
	cpumask_copy(cs->effective_cpus, new_cpus);
	cs->mems_allowed = *new_mems;
	cs->effective_mems = *new_mems;
	spin_unlock_irq(&callback_lock);

	/*
	 * Tasks are only updated when the new mask is non-empty; an empty
	 * cpuset is handled by moving its tasks away below.
	 */
	if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
		update_tasks_cpumask(cs);
	if (mems_updated && !nodes_empty(cs->mems_allowed))
		update_tasks_nodemask(cs);

	is_empty = cpumask_empty(cs->cpus_allowed) ||
		   nodes_empty(cs->mems_allowed);

	/*
	 * The task transfer runs without cpuset_mutex.
	 * NOTE(review): presumably because the transfer path takes
	 * cpuset_mutex itself - confirm.
	 */
	mutex_unlock(&cpuset_mutex);

	if (is_empty)
		remove_tasks_in_empty_cpuset(cs);

	/* Re-take the mutex so the caller's unlock stays balanced. */
	mutex_lock(&cpuset_mutex);
}
2148
2149static void
2150hotplug_update_tasks(struct cpuset *cs,
2151 struct cpumask *new_cpus, nodemask_t *new_mems,
2152 bool cpus_updated, bool mems_updated)
2153{
2154 if (cpumask_empty(new_cpus))
2155 cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
2156 if (nodes_empty(*new_mems))
2157 *new_mems = parent_cs(cs)->effective_mems;
2158
2159 spin_lock_irq(&callback_lock);
2160 cpumask_copy(cs->effective_cpus, new_cpus);
2161 cs->effective_mems = *new_mems;
2162 spin_unlock_irq(&callback_lock);
2163
2164 if (cpus_updated)
2165 update_tasks_cpumask(cs);
2166 if (mems_updated)
2167 update_tasks_nodemask(cs);
2168}
2169
2170
2171
2172
2173
2174
2175
2176
2177
/*
 * cpuset_hotplug_update_tasks - recompute one cpuset's masks after hotplug
 * @cs: cpuset to update
 *
 * Waits until no attach is in progress on @cs, then recomputes its masks
 * as the intersection of its configured masks with the parent's effective
 * masks and hands the result to the hierarchy-specific helper.
 */
static void cpuset_hotplug_update_tasks(struct cpuset *cs)
{
	/*
	 * Scratch masks are static rather than on the stack.
	 * NOTE(review): this is only safe if calls are serialized - confirm.
	 */
	static cpumask_t new_cpus;
	static nodemask_t new_mems;
	bool cpus_updated;
	bool mems_updated;
retry:
	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);

	mutex_lock(&cpuset_mutex);

	/*
	 * attach_in_progress was tested without the mutex above, so test it
	 * again now that the mutex is held and start over if an attach
	 * began in between.
	 */
	if (cs->attach_in_progress) {
		mutex_unlock(&cpuset_mutex);
		goto retry;
	}

	cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
	nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);

	cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
	mems_updated = !nodes_equal(new_mems, cs->effective_mems);

	if (cgroup_on_dfl(cs->css.cgroup))
		hotplug_update_tasks(cs, &new_cpus, &new_mems,
				     cpus_updated, mems_updated);
	else
		/* may drop and re-take cpuset_mutex internally */
		hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
					    cpus_updated, mems_updated);

	mutex_unlock(&cpuset_mutex);
}
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
/*
 * cpuset_hotplug_workfn - work function reacting to CPU/memory hotplug
 * @work: unused
 *
 * Synchronises top_cpuset with cpu_active_mask and node_states[N_MEMORY],
 * then propagates the change to every other cpuset that can still be
 * pinned online, and finally rebuilds the scheduler domains if the CPU set
 * changed.
 */
static void cpuset_hotplug_workfn(struct work_struct *work)
{
	static cpumask_t new_cpus;
	static nodemask_t new_mems;
	bool cpus_updated, mems_updated;
	bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);

	mutex_lock(&cpuset_mutex);

	/* Snapshot the current hardware state. */
	cpumask_copy(&new_cpus, cpu_active_mask);
	new_mems = node_states[N_MEMORY];

	cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
	mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);

	/*
	 * Sync top_cpuset's CPUs.  The configured mask is only touched on a
	 * legacy hierarchy.  Unlike the memory branch below, no per-task
	 * update is issued for top_cpuset here.
	 */
	if (cpus_updated) {
		spin_lock_irq(&callback_lock);
		if (!on_dfl)
			cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
		cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
		spin_unlock_irq(&callback_lock);

	}

	/* Sync top_cpuset's memory nodes and update its tasks. */
	if (mems_updated) {
		spin_lock_irq(&callback_lock);
		if (!on_dfl)
			top_cpuset.mems_allowed = new_mems;
		top_cpuset.effective_mems = new_mems;
		spin_unlock_irq(&callback_lock);
		update_tasks_nodemask(&top_cpuset);
	}

	mutex_unlock(&cpuset_mutex);

	/* Propagate to the descendants, parents before children. */
	if (cpus_updated || mems_updated) {
		struct cpuset *cs;
		struct cgroup_subsys_state *pos_css;

		rcu_read_lock();
		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
			/* top_cpuset was handled above; skip cpusets that can't be pinned. */
			if (cs == &top_cpuset || !css_tryget_online(&cs->css))
				continue;
			/*
			 * The per-cpuset update sleeps (wait_event, mutex),
			 * so leave the RCU read section around it; the css
			 * reference taken above keeps @cs alive meanwhile.
			 */
			rcu_read_unlock();

			cpuset_hotplug_update_tasks(cs);

			/* Re-enter RCU before dropping the reference and iterating on. */
			rcu_read_lock();
			css_put(&cs->css);
		}
		rcu_read_unlock();
	}

	/* Scheduler domains depend on the CPU set only. */
	if (cpus_updated)
		rebuild_sched_domains();
}
2291
/*
 * cpuset_update_active_cpus - react to a change in the set of active CPUs
 * @cpu_online: not used by this function
 *
 * Repartitions the scheduler domains right away and defers the cpuset
 * update to cpuset_hotplug_work.
 */
void cpuset_update_active_cpus(bool cpu_online)
{
	/*
	 * NOTE(review): (1, NULL, NULL) presumably requests a single default
	 * domain until the work item rebuilds the real ones - confirm
	 * against partition_sched_domains().
	 */
	partition_sched_domains(1, NULL, NULL);
	schedule_work(&cpuset_hotplug_work);
}
2307
2308
2309
2310
2311
2312
/*
 * cpuset_track_online_nodes - notifier callback for memory node changes
 * @self: notifier block (unused)
 * @action: notification type (unused)
 * @arg: notification payload (unused)
 *
 * Every notification, whatever its type, schedules cpuset_hotplug_work.
 * Always returns NOTIFY_OK.
 */
static int cpuset_track_online_nodes(struct notifier_block *self,
				unsigned long action, void *arg)
{
	schedule_work(&cpuset_hotplug_work);
	return NOTIFY_OK;
}
2319
2320static struct notifier_block cpuset_track_online_nodes_nb = {
2321 .notifier_call = cpuset_track_online_nodes,
2322 .priority = 10,
2323};
2324
2325
2326
2327
2328
2329
2330void __init cpuset_init_smp(void)
2331{
2332 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2333 top_cpuset.mems_allowed = node_states[N_MEMORY];
2334 top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
2335
2336 cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
2337 top_cpuset.effective_mems = node_states[N_MEMORY];
2338
2339 register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
2340}
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2354{
2355 unsigned long flags;
2356
2357 spin_lock_irqsave(&callback_lock, flags);
2358 rcu_read_lock();
2359 guarantee_online_cpus(task_cs(tsk), pmask);
2360 rcu_read_unlock();
2361 spin_unlock_irqrestore(&callback_lock, flags);
2362}
2363
2364void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2365{
2366 rcu_read_lock();
2367 do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
2368 rcu_read_unlock();
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387}
2388
2389void __init cpuset_init_current_mems_allowed(void)
2390{
2391 nodes_setall(current->mems_allowed);
2392}
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2405{
2406 nodemask_t mask;
2407 unsigned long flags;
2408
2409 spin_lock_irqsave(&callback_lock, flags);
2410 rcu_read_lock();
2411 guarantee_online_mems(task_cs(tsk), &mask);
2412 rcu_read_unlock();
2413 spin_unlock_irqrestore(&callback_lock, flags);
2414
2415 return mask;
2416}
2417
2418
2419
2420
2421
2422
2423
2424int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
2425{
2426 return nodes_intersects(*nodemask, current->mems_allowed);
2427}
2428
2429
2430
2431
2432
2433
2434
/*
 * nearest_hardwall_ancestor - find the closest mem-exclusive/hardwall cpuset
 * @cs: cpuset to start from
 *
 * Returns @cs itself if it is mem_exclusive or mem_hardwall, otherwise the
 * nearest ancestor that is.  If none is, the cpuset without a parent is
 * returned.
 */
static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
{
	for (;;) {
		if (is_mem_exclusive(cs) || is_mem_hardwall(cs))
			break;
		if (!parent_cs(cs))
			break;
		cs = parent_cs(cs);
	}

	return cs;
}
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490int __cpuset_node_allowed(int node, gfp_t gfp_mask)
2491{
2492 struct cpuset *cs;
2493 int allowed;
2494 unsigned long flags;
2495
2496 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2497 return 1;
2498 if (node_isset(node, current->mems_allowed))
2499 return 1;
2500
2501
2502
2503
2504 if (unlikely(test_thread_flag(TIF_MEMDIE)))
2505 return 1;
2506 if (gfp_mask & __GFP_HARDWALL)
2507 return 0;
2508
2509 if (current->flags & PF_EXITING)
2510 return 1;
2511
2512
2513 spin_lock_irqsave(&callback_lock, flags);
2514
2515 rcu_read_lock();
2516 cs = nearest_hardwall_ancestor(task_cs(current));
2517 allowed = node_isset(node, cs->mems_allowed);
2518 rcu_read_unlock();
2519
2520 spin_unlock_irqrestore(&callback_lock, flags);
2521 return allowed;
2522}
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551static int cpuset_spread_node(int *rotor)
2552{
2553 int node;
2554
2555 node = next_node(*rotor, current->mems_allowed);
2556 if (node == MAX_NUMNODES)
2557 node = first_node(current->mems_allowed);
2558 *rotor = node;
2559 return node;
2560}
2561
2562int cpuset_mem_spread_node(void)
2563{
2564 if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
2565 current->cpuset_mem_spread_rotor =
2566 node_random(¤t->mems_allowed);
2567
2568 return cpuset_spread_node(¤t->cpuset_mem_spread_rotor);
2569}
2570
2571int cpuset_slab_spread_node(void)
2572{
2573 if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
2574 current->cpuset_slab_spread_rotor =
2575 node_random(¤t->mems_allowed);
2576
2577 return cpuset_spread_node(¤t->cpuset_slab_spread_rotor);
2578}
2579
2580EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2594 const struct task_struct *tsk2)
2595{
2596 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
2597}
2598
2599
2600
2601
2602
2603
2604
2605
2606void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2607{
2608 struct cgroup *cgrp;
2609
2610 rcu_read_lock();
2611
2612 cgrp = task_cs(tsk)->css.cgroup;
2613 pr_info("%s cpuset=", tsk->comm);
2614 pr_cont_cgroup_name(cgrp);
2615 pr_cont(" mems_allowed=%*pbl\n", nodemask_pr_args(&tsk->mems_allowed));
2616
2617 rcu_read_unlock();
2618}
2619
2620
2621
2622
2623
2624
2625
/*
 * Global switch reported through the "memory_pressure_enabled" file (see
 * cpuset_read_u64()).  Its writers and consumers are outside this part of
 * the file.
 */
int cpuset_memory_pressure_enabled __read_mostly;
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646void __cpuset_memory_pressure_bump(void)
2647{
2648 rcu_read_lock();
2649 fmeter_markevent(&task_cs(current)->fmeter);
2650 rcu_read_unlock();
2651}
2652
#ifdef CONFIG_PROC_PID_CPUSET
/*
 * proc_cpuset_show - print the cgroup path of a task's cpuset
 * @m: seq_file to print into
 * @ns: pid namespace (unused)
 * @pid: pid of the task (unused)
 * @tsk: task whose cpuset path is printed
 *
 * Writes the path followed by a newline.
 *
 * Returns 0 on success, -ENOMEM if the path buffer cannot be allocated, or
 * -ENAMETOOLONG if cgroup_path() fails to produce a path within PATH_MAX
 * bytes.
 */
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
		     struct pid *pid, struct task_struct *tsk)
{
	struct cgroup_subsys_state *css;
	char *path_buf, *path;
	int rc;

	path_buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!path_buf)
		return -ENOMEM;

	/* resolve the path while the task's css is pinned by RCU */
	rcu_read_lock();
	css = task_css(tsk, cpuset_cgrp_id);
	path = cgroup_path(css->cgroup, path_buf, PATH_MAX);
	rcu_read_unlock();

	if (path) {
		seq_puts(m, path);
		seq_putc(m, '\n');
		rc = 0;
	} else {
		rc = -ENAMETOOLONG;
	}

	kfree(path_buf);

	return rc;
}
#endif
2691
2692
2693void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2694{
2695 seq_printf(m, "Mems_allowed:\t%*pb\n",
2696 nodemask_pr_args(&task->mems_allowed));
2697 seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
2698 nodemask_pr_args(&task->mems_allowed));
2699}
2700