// SPDX-License-Identifier: GPL-2.0
/*
 * Scheduler topology setup/handling methods
 */
#include "sched.h"

DEFINE_MUTEX(sched_domains_mutex);

/* Protected by sched_domains_mutex: */
static cpumask_var_t sched_domains_tmpmask;
static cpumask_var_t sched_domains_tmpmask2;

#ifdef CONFIG_SCHED_DEBUG

static int __init sched_debug_setup(char *str)
{
	sched_debug_verbose = true;

	return 0;
}
early_param("sched_verbose", sched_debug_setup);

static inline bool sched_debug(void)
{
	return sched_debug_verbose;
}

#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
const struct sd_flag_debug sd_flag_debug[] = {
#include <linux/sched/sd_flags.h>
};
#undef SD_FLAG

static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
				  struct cpumask *groupmask)
{
	struct sched_group *group = sd->groups;
	unsigned long flags = sd->flags;
	unsigned int idx;

	cpumask_clear(groupmask);

	printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
	printk(KERN_CONT "span=%*pbl level=%s\n",
	       cpumask_pr_args(sched_domain_span(sd)), sd->name);

	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
		printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
	}
	if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) {
		printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
	}

	for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
		unsigned int flag = BIT(idx);
		unsigned int meta_flags = sd_flag_debug[idx].meta_flags;

		if ((meta_flags & SDF_SHARED_CHILD) && sd->child &&
		    !(sd->child->flags & flag))
			printk(KERN_ERR "ERROR: flag %s set here but not in child\n",
			       sd_flag_debug[idx].name);

		if ((meta_flags & SDF_SHARED_PARENT) && sd->parent &&
		    !(sd->parent->flags & flag))
			printk(KERN_ERR "ERROR: flag %s set here but not in parent\n",
			       sd_flag_debug[idx].name);
	}

	printk(KERN_DEBUG "%*s groups:", level + 1, "");
	do {
		if (!group) {
			printk("\n");
			printk(KERN_ERR "ERROR: group is NULL\n");
			break;
		}

		if (!cpumask_weight(sched_group_span(group))) {
			printk(KERN_CONT "\n");
			printk(KERN_ERR "ERROR: empty group\n");
			break;
		}

		if (!(sd->flags & SD_OVERLAP) &&
		    cpumask_intersects(groupmask, sched_group_span(group))) {
			printk(KERN_CONT "\n");
			printk(KERN_ERR "ERROR: repeated CPUs\n");
			break;
		}

		cpumask_or(groupmask, groupmask, sched_group_span(group));

		printk(KERN_CONT " %d:{ span=%*pbl",
		       group->sgc->id,
		       cpumask_pr_args(sched_group_span(group)));

		if ((sd->flags & SD_OVERLAP) &&
		    !cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
			printk(KERN_CONT " mask=%*pbl",
			       cpumask_pr_args(group_balance_mask(group)));
		}

		if (group->sgc->capacity != SCHED_CAPACITY_SCALE)
			printk(KERN_CONT " cap=%lu", group->sgc->capacity);

		if (group == sd->groups && sd->child &&
		    !cpumask_equal(sched_domain_span(sd->child),
				   sched_group_span(group))) {
			printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");
		}

		printk(KERN_CONT " }");

		group = group->next;

		if (group != sd->groups)
			printk(KERN_CONT ",");

	} while (group != sd->groups);
	printk(KERN_CONT "\n");

	if (!cpumask_equal(sched_domain_span(sd), groupmask))
		printk(KERN_ERR "ERROR: groups don't span domain->span\n");

	if (sd->parent &&
	    !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
		printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
	return 0;
}

static void sched_domain_debug(struct sched_domain *sd, int cpu)
{
	int level = 0;

	if (!sched_debug_verbose)
		return;

	if (!sd) {
		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
		return;
	}

	printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu);

	for (;;) {
		if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
			break;
		level++;
		sd = sd->parent;
		if (!sd)
			break;
	}
}
#else /* !CONFIG_SCHED_DEBUG */

# define sched_debug_verbose 0
# define sched_domain_debug(sd, cpu) do { } while (0)
static inline bool sched_debug(void)
{
	return false;
}
#endif /* CONFIG_SCHED_DEBUG */

/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
static const unsigned int SD_DEGENERATE_GROUPS_MASK =
#include <linux/sched/sd_flags.h>
0;
#undef SD_FLAG

static int sd_degenerate(struct sched_domain *sd)
{
	if (cpumask_weight(sched_domain_span(sd)) == 1)
		return 1;

	/* Following flags need at least 2 groups */
	if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) &&
	    (sd->groups != sd->groups->next))
		return 0;

	/* Following flags don't use groups */
	if (sd->flags & (SD_WAKE_AFFINE))
		return 0;

	return 1;
}

static int
sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
{
	unsigned long cflags = sd->flags, pflags = parent->flags;

	if (sd_degenerate(parent))
		return 1;

	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
		return 0;

	/* Flags needing groups don't count if only 1 group in parent */
	if (parent->groups == parent->groups->next)
		pflags &= ~SD_DEGENERATE_GROUPS_MASK;

	if (~cflags & pflags)
		return 0;

	return 1;
}

#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
DEFINE_STATIC_KEY_FALSE(sched_energy_present);
unsigned int sysctl_sched_energy_aware = 1;
DEFINE_MUTEX(sched_energy_mutex);
bool sched_energy_update;

void rebuild_sched_domains_energy(void)
{
	mutex_lock(&sched_energy_mutex);
	sched_energy_update = true;
	rebuild_sched_domains();
	sched_energy_update = false;
	mutex_unlock(&sched_energy_mutex);
}

#ifdef CONFIG_PROC_SYSCTL
int sched_energy_aware_handler(struct ctl_table *table, int write,
			       void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret, state;

	if (write && !capable(CAP_SYS_ADMIN))
		return -EPERM;

	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (!ret && write) {
		state = static_branch_unlikely(&sched_energy_present);
		if (state != sysctl_sched_energy_aware)
			rebuild_sched_domains_energy();
	}

	return ret;
}
#endif

static void free_pd(struct perf_domain *pd)
{
	struct perf_domain *tmp;

	while (pd) {
		tmp = pd->next;
		kfree(pd);
		pd = tmp;
	}
}

static struct perf_domain *find_pd(struct perf_domain *pd, int cpu)
{
	while (pd) {
		if (cpumask_test_cpu(cpu, perf_domain_span(pd)))
			return pd;
		pd = pd->next;
	}

	return NULL;
}

static struct perf_domain *pd_init(int cpu)
{
	struct em_perf_domain *obj = em_cpu_get(cpu);
	struct perf_domain *pd;

	if (!obj) {
		if (sched_debug())
			pr_info("%s: no EM found for CPU%d\n", __func__, cpu);
		return NULL;
	}

	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
	if (!pd)
		return NULL;
	pd->em_pd = obj;

	return pd;
}

static void perf_domain_debug(const struct cpumask *cpu_map,
			      struct perf_domain *pd)
{
	if (!sched_debug() || !pd)
		return;

	printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map));

	while (pd) {
		printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }",
		       cpumask_first(perf_domain_span(pd)),
		       cpumask_pr_args(perf_domain_span(pd)),
		       em_pd_nr_perf_states(pd->em_pd));
		pd = pd->next;
	}

	printk(KERN_CONT "\n");
}

static void destroy_perf_domain_rcu(struct rcu_head *rp)
{
	struct perf_domain *pd;

	pd = container_of(rp, struct perf_domain, rcu);
	free_pd(pd);
}

static void sched_energy_set(bool has_eas)
{
	if (!has_eas && static_branch_unlikely(&sched_energy_present)) {
		if (sched_debug())
			pr_info("%s: stopping EAS\n", __func__);
		static_branch_disable_cpuslocked(&sched_energy_present);
	} else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
		if (sched_debug())
			pr_info("%s: starting EAS\n", __func__);
		static_branch_enable_cpuslocked(&sched_energy_present);
	}
}
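/*
 * EAS can be used for a root domain only if:
 *    1. an Energy Model (EM) is available;
 *    2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy;
 *    3. no SMT is detected;
 *    4. schedutil is driving the frequency of all CPUs of the rd;
 *    5. frequency invariance support is present;
 *    6. the EM complexity is low enough to keep scheduling overheads low.
 *
 * The complexity of the Energy Model is defined as:
 *
 *              C = nr_pd * (nr_cpus + nr_ps)
 *
 * with: 'nr_pd' the number of performance domains; 'nr_cpus' the number of
 * CPUs; and 'nr_ps' the sum of the number of performance states of all
 * performance domains.
 *
 * It is generally not a good idea to use such a model in the wake-up path
 * on complex platforms because of the associated scheduling overheads; the
 * arbitrary constraint below prevents that (all conditions are checked in
 * build_perf_domains()).
 */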
#define EM_MAX_COMPLEXITY 2048

extern struct cpufreq_governor schedutil_gov;
static bool build_perf_domains(const struct cpumask *cpu_map)
{
	int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map);
	struct perf_domain *pd = NULL, *tmp;
	int cpu = cpumask_first(cpu_map);
	struct root_domain *rd = cpu_rq(cpu)->rd;
	struct cpufreq_policy *policy;
	struct cpufreq_governor *gov;

	if (!sysctl_sched_energy_aware)
		goto free;

	/* EAS is enabled for asymmetric CPU capacity topologies. */
	if (!per_cpu(sd_asym_cpucapacity, cpu)) {
		if (sched_debug()) {
			pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
				cpumask_pr_args(cpu_map));
		}
		goto free;
	}

	/* EAS definitely does *not* handle SMT */
	if (sched_smt_active()) {
		pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n",
			cpumask_pr_args(cpu_map));
		goto free;
	}

	if (!arch_scale_freq_invariant()) {
		if (sched_debug()) {
			pr_warn("rd %*pbl: Disabling EAS: frequency-invariant load tracking not yet supported",
				cpumask_pr_args(cpu_map));
		}
		goto free;
	}

	for_each_cpu(i, cpu_map) {
		/* Skip already covered CPUs. */
		if (find_pd(pd, i))
			continue;

		/* Do not attempt EAS if schedutil is not being used. */
		policy = cpufreq_cpu_get(i);
		if (!policy)
			goto free;
		gov = policy->governor;
		cpufreq_cpu_put(policy);
		if (gov != &schedutil_gov) {
			if (rd->pd)
				pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n",
					cpumask_pr_args(cpu_map));
			goto free;
		}

		/* Create the new pd and add it to the local list. */
		tmp = pd_init(i);
		if (!tmp)
			goto free;
		tmp->next = pd;
		pd = tmp;

		/*
		 * Count performance domains and performance states for the
		 * complexity check.
		 */
		nr_pd++;
		nr_ps += em_pd_nr_perf_states(pd->em_pd);
	}

	/* Bail out if the Energy Model complexity is too high. */
	if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) {
		WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
		     cpumask_pr_args(cpu_map));
		goto free;
	}

	perf_domain_debug(cpu_map, pd);

	/* Attach the new list of performance domains to the root domain. */
	tmp = rd->pd;
	rcu_assign_pointer(rd->pd, pd);
	if (tmp)
		call_rcu(&tmp->rcu, destroy_perf_domain_rcu);

	return !!pd;

free:
	free_pd(pd);
	tmp = rd->pd;
	rcu_assign_pointer(rd->pd, NULL);
	if (tmp)
		call_rcu(&tmp->rcu, destroy_perf_domain_rcu);

	return false;
}
#else
static void free_pd(struct perf_domain *pd) { }
#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */

static void free_rootdomain(struct rcu_head *rcu)
{
	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);

	cpupri_cleanup(&rd->cpupri);
	cpudl_cleanup(&rd->cpudl);
	free_cpumask_var(rd->dlo_mask);
	free_cpumask_var(rd->rto_mask);
	free_cpumask_var(rd->online);
	free_cpumask_var(rd->span);
	free_pd(rd->pd);
	kfree(rd);
}

void rq_attach_root(struct rq *rq, struct root_domain *rd)
{
	struct root_domain *old_rd = NULL;
	unsigned long flags;

	raw_spin_rq_lock_irqsave(rq, flags);

	if (rq->rd) {
		old_rd = rq->rd;

		if (cpumask_test_cpu(rq->cpu, old_rd->online))
			set_rq_offline(rq);

		cpumask_clear_cpu(rq->cpu, old_rd->span);

		/*
		 * If we don't want to free the old_rd yet then
		 * set old_rd to NULL to skip the freeing later
		 * in this function:
		 */
		if (!atomic_dec_and_test(&old_rd->refcount))
			old_rd = NULL;
	}

	atomic_inc(&rd->refcount);
	rq->rd = rd;

	cpumask_set_cpu(rq->cpu, rd->span);
	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
		set_rq_online(rq);

	raw_spin_rq_unlock_irqrestore(rq, flags);

	if (old_rd)
		call_rcu(&old_rd->rcu, free_rootdomain);
}

void sched_get_rd(struct root_domain *rd)
{
	atomic_inc(&rd->refcount);
}

void sched_put_rd(struct root_domain *rd)
{
	if (!atomic_dec_and_test(&rd->refcount))
		return;

	call_rcu(&rd->rcu, free_rootdomain);
}

static int init_rootdomain(struct root_domain *rd)
{
	if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
		goto out;
	if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
		goto free_span;
	if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
		goto free_online;
	if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
		goto free_dlo_mask;

#ifdef HAVE_RT_PUSH_IPI
	rd->rto_cpu = -1;
	raw_spin_lock_init(&rd->rto_lock);
	init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
#endif

	rd->visit_gen = 0;
	init_dl_bw(&rd->dl_bw);
	if (cpudl_init(&rd->cpudl) != 0)
		goto free_rto_mask;

	if (cpupri_init(&rd->cpupri) != 0)
		goto free_cpudl;
	return 0;

free_cpudl:
	cpudl_cleanup(&rd->cpudl);
free_rto_mask:
	free_cpumask_var(rd->rto_mask);
free_dlo_mask:
	free_cpumask_var(rd->dlo_mask);
free_online:
	free_cpumask_var(rd->online);
free_span:
	free_cpumask_var(rd->span);
out:
	return -ENOMEM;
}

/*
 * By default the system creates a single root-domain with all CPUs as
 * members (mimicking the global state we have today).
 */
struct root_domain def_root_domain;

void init_defrootdomain(void)
{
	init_rootdomain(&def_root_domain);

	atomic_set(&def_root_domain.refcount, 1);
}

static struct root_domain *alloc_rootdomain(void)
{
	struct root_domain *rd;

	rd = kzalloc(sizeof(*rd), GFP_KERNEL);
	if (!rd)
		return NULL;

	if (init_rootdomain(rd) != 0) {
		kfree(rd);
		return NULL;
	}

	return rd;
}

static void free_sched_groups(struct sched_group *sg, int free_sgc)
{
	struct sched_group *tmp, *first;

	if (!sg)
		return;

	first = sg;
	do {
		tmp = sg->next;

		if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
			kfree(sg->sgc);

		if (atomic_dec_and_test(&sg->ref))
			kfree(sg);
		sg = tmp;
	} while (sg != first);
}

static void destroy_sched_domain(struct sched_domain *sd)
{
	/*
	 * A normal sched domain may have multiple group references, but
	 * an overlapping domain, having private groups, only one.  Iterate,
	 * dropping group/capacity references, freeing where none remain.
	 */
	free_sched_groups(sd->groups, 1);

	if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
		kfree(sd->shared);
	kfree(sd);
}

static void destroy_sched_domains_rcu(struct rcu_head *rcu)
{
	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);

	while (sd) {
		struct sched_domain *parent = sd->parent;
		destroy_sched_domain(sd);
		sd = parent;
	}
}

static void destroy_sched_domains(struct sched_domain *sd)
{
	if (sd)
		call_rcu(&sd->rcu, destroy_sched_domains_rcu);
}
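/*
 * Keep a special pointer to the highest sched_domain that has
 * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this
 * allows us to avoid some pointer chasing in select_idle_sibling().
 *
 * Also keep a unique ID per domain (we use the first CPU number in
 * the cpumask of the domain), this allows us to quickly tell if
 * two CPUs are in the same cache domain, see cpus_share_cache().
 */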
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id);
DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);

static void update_top_cache_domain(int cpu)
{
	struct sched_domain_shared *sds = NULL;
	struct sched_domain *sd;
	int id = cpu;
	int size = 1;

	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
	if (sd) {
		id = cpumask_first(sched_domain_span(sd));
		size = cpumask_weight(sched_domain_span(sd));
		sds = sd->shared;
	}

	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
	per_cpu(sd_llc_size, cpu) = size;
	per_cpu(sd_llc_id, cpu) = id;
	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);

	sd = lowest_flag_domain(cpu, SD_NUMA);
	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);

	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
	rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);

	sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
	rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
}

/*
 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
 * hold the hotplug lock.
 */
static void
cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	struct sched_domain *tmp;
	int numa_distance = 0;

	/* Remove the sched domains which do not contribute to scheduling. */
	for (tmp = sd; tmp; ) {
		struct sched_domain *parent = tmp->parent;
		if (!parent)
			break;

		if (sd_parent_degenerate(tmp, parent)) {
			tmp->parent = parent->parent;
			if (parent->parent)
				parent->parent->child = tmp;
			/*
			 * Transfer SD_PREFER_SIBLING down in case of a
			 * degenerate parent; the spans match for this
			 * so the property transfers.
			 */
			if (parent->flags & SD_PREFER_SIBLING)
				tmp->flags |= SD_PREFER_SIBLING;
			destroy_sched_domain(parent);
		} else
			tmp = tmp->parent;
	}

	if (sd && sd_degenerate(sd)) {
		tmp = sd;
		sd = sd->parent;
		destroy_sched_domain(tmp);
		if (sd)
			sd->child = NULL;
	}

	for (tmp = sd; tmp; tmp = tmp->parent)
		numa_distance += !!(tmp->flags & SD_NUMA);

	sched_domain_debug(sd, cpu);

	rq_attach_root(rq, rd);
	tmp = rq->sd;
	rcu_assign_pointer(rq->sd, sd);
	dirty_sched_domain_sysctl(cpu);
	destroy_sched_domains(tmp);

	update_top_cache_domain(cpu);
}

struct s_data {
	struct sched_domain * __percpu *sd;
	struct root_domain	*rd;
};

enum s_alloc {
	sa_rootdomain,
	sa_sd,
	sa_sd_storage,
	sa_none,
};
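/*
 * Return the canonical balance CPU for this group, this is the first CPU
 * of this group that's also in the balance mask.
 *
 * The balance mask are all those CPUs that could actually end up at this
 * group. See build_balance_mask().
 *
 * Also see should_we_balance().
 */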
int group_balance_cpu(struct sched_group *sg)
{
	return cpumask_first(group_balance_mask(sg));
}
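/*
 * NUMA topology (see also the package-topology comment above get_group()
 * further down).
 *
 * Given a node-distance table such as this 4-node ring:
 *
 *   node   0   1   2   3
 *     0:  10  20  30  20
 *     1:  20  10  20  30
 *     2:  30  20  10  20
 *     3:  20  30  20  10
 *
 *   0 ----- 1
 *   |       |
 *   3 ----- 2
 *
 * we build the domains per 'hop': for each NUMA level we construct, per
 * node, the mask of all nodes reachable within that distance. Unlike the
 * regular topology, the spans of two CPUs at the same level need not be
 * equal, and a node can be covered by several of them -- hence the
 * "overlap" naming for this part of the topology. To minimize duplication
 * we only build enough groups to cover each domain span, and share the
 * sched_group_capacity between the CPUs that can actually balance a given
 * group (see build_balance_mask() below).
 */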
static void
build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
{
	const struct cpumask *sg_span = sched_group_span(sg);
	struct sd_data *sdd = sd->private;
	struct sched_domain *sibling;
	int i;

	cpumask_clear(mask);

	for_each_cpu(i, sg_span) {
		sibling = *per_cpu_ptr(sdd->sd, i);

		/*
		 * Can happen in the asymmetric case, where these siblings are
		 * unused. A sibling without a child domain cannot be the
		 * point from which this group gets balanced.
		 */
		if (!sibling->child)
			continue;

		/* If we would not end up here, we can't continue from here */
		if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
			continue;

		cpumask_set_cpu(i, mask);
	}

	/* We must not have empty masks here */
	WARN_ON_ONCE(cpumask_empty(mask));
}
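/*
 * XXX: This creates per-node group entries; since the load-balancer will
 * immediately access remote memory to construct this group's load-balance
 * statistics, having the groups node-local is of dubious benefit.
 */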
static struct sched_group *
build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
{
	struct sched_group *sg;
	struct cpumask *sg_span;

	sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
			  GFP_KERNEL, cpu_to_node(cpu));

	if (!sg)
		return NULL;

	sg_span = sched_group_span(sg);
	if (sd->child)
		cpumask_copy(sg_span, sched_domain_span(sd->child));
	else
		cpumask_copy(sg_span, sched_domain_span(sd));

	atomic_inc(&sg->ref);
	return sg;
}

static void init_overlap_sched_group(struct sched_domain *sd,
				     struct sched_group *sg)
{
	struct cpumask *mask = sched_domains_tmpmask2;
	struct sd_data *sdd = sd->private;
	struct cpumask *sg_span;
	int cpu;

	build_balance_mask(sd, sg, mask);
	cpu = cpumask_first(mask);

	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
	if (atomic_inc_return(&sg->sgc->ref) == 1)
		cpumask_copy(group_balance_mask(sg), mask);
	else
		WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask));

	/*
	 * Initialize sgc->capacity such that even if we mess up the
	 * domains and no possible iteration will get us here, we won't
	 * die on a /0 trap.
	 */
	sg_span = sched_group_span(sg);
	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
	sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
}

static struct sched_domain *
find_descended_sibling(struct sched_domain *sd, struct sched_domain *sibling)
{
	/*
	 * The proper descendant is the one whose child won't span out
	 * of sd.
	 */
	while (sibling->child &&
	       !cpumask_subset(sched_domain_span(sibling->child),
			       sched_domain_span(sd)))
		sibling = sibling->child;

	/*
	 * As we are referencing sgc across different topology levels, we need
	 * to skip those sched_domains that don't contribute to the span;
	 * keep descending while the child covers the same span as the
	 * sibling itself.
	 */
	while (sibling->child &&
	       cpumask_equal(sched_domain_span(sibling->child),
			     sched_domain_span(sibling)))
		sibling = sibling->child;

	return sibling;
}

static int
build_overlap_sched_groups(struct sched_domain *sd, int cpu)
{
	struct sched_group *first = NULL, *last = NULL, *sg;
	const struct cpumask *span = sched_domain_span(sd);
	struct cpumask *covered = sched_domains_tmpmask;
	struct sd_data *sdd = sd->private;
	struct sched_domain *sibling;
	int i;

	cpumask_clear(covered);

	for_each_cpu_wrap(i, span, cpu) {
		struct cpumask *sg_span;

		if (cpumask_test_cpu(i, covered))
			continue;

		sibling = *per_cpu_ptr(sdd->sd, i);

		/*
		 * Asymmetric node setups can result in situations where the
		 * domain tree is of unequal depth, make sure to skip domains
		 * that already cover the entire range.
		 *
		 * In that case build_sched_domains() will have terminated the
		 * iteration early and our sibling sd spans will be empty.
		 * Domains should always include the CPU they're built on, so
		 * check that.
		 */
		if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
			continue;
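		/*
		 * Usually we build the group from the sibling's child
		 * sched_domain. But on machines whose NUMA diameter is 3 or
		 * more, that child's span can reach outside of the domain
		 * being built, which would yield groups that aren't a
		 * subset of the domain span. In that case, walk down to a
		 * proper descendant (see find_descended_sibling()) and
		 * build the group from its child instead.
		 */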
		if (sibling->child &&
		    !cpumask_subset(sched_domain_span(sibling->child), span))
			sibling = find_descended_sibling(sd, sibling);

		sg = build_group_from_child_sched_domain(sibling, cpu);
		if (!sg)
			goto fail;

		sg_span = sched_group_span(sg);
		cpumask_or(covered, covered, sg_span);

		init_overlap_sched_group(sibling, sg);

		if (!first)
			first = sg;
		if (last)
			last->next = sg;
		last = sg;
		last->next = first;
	}
	sd->groups = first;

	return 0;

fail:
	free_sched_groups(first, 0);

	return -ENOMEM;
}
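/*
 * Package topology (the regular, non-overlapping case)
 *
 * The topology tree has, per CPU, one sched_domain per level (e.g. SMT,
 * MC, DIE), each spanning a superset of the level below. Every domain
 * carries a circular linked list of sched_group's, one group per child
 * domain; load balancing at a level moves tasks between those groups, and
 * balancing of a group is typically initiated by its balance CPU (see
 * group_balance_cpu() and should_we_balance()).
 *
 * Because the same group is reachable from every CPU it spans, sched_group
 * and sched_group_capacity are reference-counted and shared: get_group()
 * below initializes a group on first visit and merely takes extra
 * references on subsequent visits; the references are consumed by
 * claim_allocations()/free_sched_groups().
 */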
static struct sched_group *get_group(int cpu, struct sd_data *sdd)
{
	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
	struct sched_domain *child = sd->child;
	struct sched_group *sg;
	bool already_visited;

	if (child)
		cpu = cpumask_first(sched_domain_span(child));

	sg = *per_cpu_ptr(sdd->sg, cpu);
	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);

	/* Increase refcounts for claim_allocations: */
	already_visited = atomic_inc_return(&sg->ref) > 1;
	/* sgc visits should follow a similar trend as sg */
	WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1));

	/* If we have already visited that group, it's already initialized. */
	if (already_visited)
		return sg;

	if (child) {
		cpumask_copy(sched_group_span(sg), sched_domain_span(child));
		cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
	} else {
		cpumask_set_cpu(cpu, sched_group_span(sg));
		cpumask_set_cpu(cpu, group_balance_mask(sg));
	}

	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
	sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;

	return sg;
}

/*
 * build_sched_groups will build a circular linked list of the groups
 * covered by the given span, will set each group's ->cpumask correctly,
 * and will initialize their ->sgc.
 *
 * Assumes the sched_domain tree is fully constructed.
 */
static int
build_sched_groups(struct sched_domain *sd, int cpu)
{
	struct sched_group *first = NULL, *last = NULL;
	struct sd_data *sdd = sd->private;
	const struct cpumask *span = sched_domain_span(sd);
	struct cpumask *covered;
	int i;

	lockdep_assert_held(&sched_domains_mutex);
	covered = sched_domains_tmpmask;

	cpumask_clear(covered);

	for_each_cpu_wrap(i, span, cpu) {
		struct sched_group *sg;

		if (cpumask_test_cpu(i, covered))
			continue;

		sg = get_group(i, sdd);

		cpumask_or(covered, covered, sched_group_span(sg));

		if (!first)
			first = sg;
		if (last)
			last->next = sg;
		last = sg;
	}
	last->next = first;
	sd->groups = first;

	return 0;
}
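/*
 * Initialize sched groups cpu_capacity.
 *
 * cpu_capacity indicates the capacity of a sched group, which is used
 * while distributing load between different sched groups in a sched
 * domain. Typically cpu_capacity for all the groups in a sched domain
 * will be same unless there are asymmetries in the topology. If there
 * are asymmetries, the group having more cpu_capacity will pick up more
 * load compared to the group having less cpu_capacity.
 */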
static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
{
	struct sched_group *sg = sd->groups;

	WARN_ON(!sg);

	do {
		int cpu, max_cpu = -1;

		sg->group_weight = cpumask_weight(sched_group_span(sg));

		if (!(sd->flags & SD_ASYM_PACKING))
			goto next;

		for_each_cpu(cpu, sched_group_span(sg)) {
			if (max_cpu < 0)
				max_cpu = cpu;
			else if (sched_asym_prefer(cpu, max_cpu))
				max_cpu = cpu;
		}
		sg->asym_prefer_cpu = max_cpu;

next:
		sg = sg->next;
	} while (sg != sd->groups);

	if (cpu != group_balance_cpu(sg))
		return;

	update_group_capacity(sd, cpu);
}

/*
 * Asymmetric CPU capacity bits
 */
struct asym_cap_data {
	struct list_head link;
	unsigned long capacity;
	unsigned long cpus[];
};

/*
 * Set of available CPUs grouped by their corresponding capacities.
 * Each list entry contains a CPU mask reflecting CPUs that share the same
 * capacity.
 * The lifespan of the data is unlimited.
 */
static LIST_HEAD(asym_cap_list);

#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus)

/*
 * Verify whether there is any CPU capacity asymmetry in a given sched
 * domain. Provides sd_flags reflecting the asymmetry scope.
 */
static inline int
asym_cpu_capacity_classify(const struct cpumask *sd_span,
			   const struct cpumask *cpu_map)
{
	struct asym_cap_data *entry;
	int count = 0, miss = 0;

	/*
	 * Count how many unique CPU capacities this domain spans across
	 * (compare sched_domain CPUs mask with ones representing available
	 * CPUs capacities). Take into account CPUs that might be offline:
	 * those end up in the 'miss' count.
	 */
	list_for_each_entry(entry, &asym_cap_list, link) {
		if (cpumask_intersects(sd_span, cpu_capacity_span(entry)))
			++count;
		else if (cpumask_intersects(cpu_map, cpu_capacity_span(entry)))
			++miss;
	}

	WARN_ON_ONCE(!count && !list_empty(&asym_cap_list));

	/* No asymmetry detected */
	if (count < 2)
		return 0;
	/* Some of the available CPU capacity values have not been detected */
	if (miss)
		return SD_ASYM_CPUCAPACITY;

	/* Full asymmetry */
	return SD_ASYM_CPUCAPACITY | SD_ASYM_CPUCAPACITY_FULL;
}

static inline void asym_cpu_capacity_update_data(int cpu)
{
	unsigned long capacity = arch_scale_cpu_capacity(cpu);
	struct asym_cap_data *entry = NULL;

	list_for_each_entry(entry, &asym_cap_list, link) {
		if (capacity == entry->capacity)
			goto done;
	}

	entry = kzalloc(sizeof(*entry) + cpumask_size(), GFP_KERNEL);
	if (WARN_ONCE(!entry, "Failed to allocate memory for asymmetry data\n"))
		return;
	entry->capacity = capacity;
	list_add(&entry->link, &asym_cap_list);
done:
	__cpumask_set_cpu(cpu, cpu_capacity_span(entry));
}

/*
 * Build-up/update the list of CPUs grouped by their capacities.
 * An update requires an explicit request to rebuild sched domains
 * with state indicating CPU topology changes.
 */
static void asym_cpu_capacity_scan(void)
{
	struct asym_cap_data *entry, *next;
	int cpu;

	list_for_each_entry(entry, &asym_cap_list, link)
		cpumask_clear(cpu_capacity_span(entry));

	for_each_cpu_and(cpu, cpu_possible_mask, housekeeping_cpumask(HK_FLAG_DOMAIN))
		asym_cpu_capacity_update_data(cpu);

	list_for_each_entry_safe(entry, next, &asym_cap_list, link) {
		if (cpumask_empty(cpu_capacity_span(entry))) {
			list_del(&entry->link);
			kfree(entry);
		}
	}

	/*
	 * Only one capacity value has been detected, i.e. this system is
	 * symmetric. No need to keep this data around.
	 */
	if (list_is_singular(&asym_cap_list)) {
		entry = list_first_entry(&asym_cap_list, typeof(*entry), link);
		list_del(&entry->link);
		kfree(entry);
	}
}
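/*
 * Initializers for schedule domains
 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
 */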
static int default_relax_domain_level = -1;
int sched_domain_level_max;

static int __init setup_relax_domain_level(char *str)
{
	if (kstrtoint(str, 0, &default_relax_domain_level))
		pr_warn("Unable to set relax_domain_level\n");

	return 1;
}
__setup("relax_domain_level=", setup_relax_domain_level);

static void set_domain_attribute(struct sched_domain *sd,
				 struct sched_domain_attr *attr)
{
	int request;

	if (!attr || attr->relax_domain_level < 0) {
		if (default_relax_domain_level < 0)
			return;
		request = default_relax_domain_level;
	} else
		request = attr->relax_domain_level;

	if (sd->level > request) {
		/* Turn off idle balance on this domain: */
		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
	}
}

static void __sdt_free(const struct cpumask *cpu_map);
static int __sdt_alloc(const struct cpumask *cpu_map);

static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
				 const struct cpumask *cpu_map)
{
	switch (what) {
	case sa_rootdomain:
		if (!atomic_read(&d->rd->refcount))
			free_rootdomain(&d->rd->rcu);
		fallthrough;
	case sa_sd:
		free_percpu(d->sd);
		fallthrough;
	case sa_sd_storage:
		__sdt_free(cpu_map);
		fallthrough;
	case sa_none:
		break;
	}
}

static enum s_alloc
__visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
{
	memset(d, 0, sizeof(*d));

	if (__sdt_alloc(cpu_map))
		return sa_sd_storage;
	d->sd = alloc_percpu(struct sched_domain *);
	if (!d->sd)
		return sa_sd_storage;
	d->rd = alloc_rootdomain();
	if (!d->rd)
		return sa_sd;

	return sa_rootdomain;
}

/*
 * NULL the sd_data elements we've used to build the sched_domain and
 * sched_group structure so that the subsequent __free_domain_allocs()
 * will not free the data we're using.
 */
static void claim_allocations(int cpu, struct sched_domain *sd)
{
	struct sd_data *sdd = sd->private;

	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
	*per_cpu_ptr(sdd->sd, cpu) = NULL;

	if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
		*per_cpu_ptr(sdd->sds, cpu) = NULL;

	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
		*per_cpu_ptr(sdd->sg, cpu) = NULL;

	if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
		*per_cpu_ptr(sdd->sgc, cpu) = NULL;
}

#ifdef CONFIG_NUMA
enum numa_topology_type sched_numa_topology_type;

static int sched_domains_numa_levels;
static int sched_domains_curr_level;

int sched_max_numa_distance;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;

static unsigned long __read_mostly *sched_numa_onlined_nodes;
#endif
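/*
 * SD_flags allowed in topology descriptions.
 *
 * These flags are purely descriptive of the topology and do not prescribe
 * behaviour. Behaviour is artificial and mapped in the below sd_init()
 * function:
 *
 *   SD_SHARE_CPUCAPACITY   - describes SMT topologies
 *   SD_SHARE_PKG_RESOURCES - describes shared caches
 *   SD_NUMA                - describes NUMA topologies
 *
 * Odd one out, which besides describing the topology also prescribes the
 * desired behaviour that goes along with it:
 *
 *   SD_ASYM_PACKING        - describes SMT quirks
 */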
#define TOPOLOGY_SD_FLAGS		\
	(SD_SHARE_CPUCAPACITY	|	\
	 SD_SHARE_PKG_RESOURCES |	\
	 SD_NUMA		|	\
	 SD_ASYM_PACKING)

static struct sched_domain *
sd_init(struct sched_domain_topology_level *tl,
	const struct cpumask *cpu_map,
	struct sched_domain *child, int cpu)
{
	struct sd_data *sdd = &tl->data;
	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
	int sd_id, sd_weight, sd_flags = 0;
	struct cpumask *sd_span;

#ifdef CONFIG_NUMA
	/*
	 * Ugly hack to pass state to sd_numa_mask()...
	 */
	sched_domains_curr_level = tl->numa_level;
#endif

	sd_weight = cpumask_weight(tl->mask(cpu));

	if (tl->sd_flags)
		sd_flags = (*tl->sd_flags)();
	if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
			"wrong sd_flags in topology description\n"))
		sd_flags &= TOPOLOGY_SD_FLAGS;

	*sd = (struct sched_domain){
		.min_interval		= sd_weight,
		.max_interval		= 2*sd_weight,
		.busy_factor		= 16,
		.imbalance_pct		= 117,

		.cache_nice_tries	= 0,

		.flags			= 1*SD_BALANCE_NEWIDLE
					| 1*SD_BALANCE_EXEC
					| 1*SD_BALANCE_FORK
					| 0*SD_BALANCE_WAKE
					| 1*SD_WAKE_AFFINE
					| 0*SD_SHARE_CPUCAPACITY
					| 0*SD_SHARE_PKG_RESOURCES
					| 0*SD_SERIALIZE
					| 1*SD_PREFER_SIBLING
					| 0*SD_NUMA
					| sd_flags
					,

		.last_balance		= jiffies,
		.balance_interval	= sd_weight,
		.max_newidle_lb_cost	= 0,
		.next_decay_max_lb_cost	= jiffies,
		.child			= child,
#ifdef CONFIG_SCHED_DEBUG
		.name			= tl->name,
#endif
	};

	sd_span = sched_domain_span(sd);
	cpumask_and(sd_span, cpu_map, tl->mask(cpu));
	sd_id = cpumask_first(sd_span);

	sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map);

	WARN_ONCE((sd->flags & (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY)) ==
		  (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY),
		  "CPU capacity asymmetry not supported on SMT\n");

	/*
	 * Convert topological properties into behaviour.
	 */
	/* Don't attempt to spread across CPUs of different capacities. */
	if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child)
		sd->child->flags &= ~SD_PREFER_SIBLING;

	if (sd->flags & SD_SHARE_CPUCAPACITY) {
		sd->imbalance_pct = 110;

	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
		sd->imbalance_pct = 117;
		sd->cache_nice_tries = 1;

#ifdef CONFIG_NUMA
	} else if (sd->flags & SD_NUMA) {
		sd->cache_nice_tries = 2;

		sd->flags &= ~SD_PREFER_SIBLING;
		sd->flags |= SD_SERIALIZE;
		if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
			sd->flags &= ~(SD_BALANCE_EXEC |
				       SD_BALANCE_FORK |
				       SD_WAKE_AFFINE);
		}

#endif
	} else {
		sd->cache_nice_tries = 1;
	}

	/*
	 * For all levels sharing cache; connect a sched_domain_shared
	 * instance.
	 */
	if (sd->flags & SD_SHARE_PKG_RESOURCES) {
		sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
		atomic_inc(&sd->shared->ref);
		atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
	}

	sd->private = sdd;

	return sd;
}

/*
 * Topology list, bottom-up.
 */
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
	{ NULL, },
};

static struct sched_domain_topology_level *sched_domain_topology =
	default_topology;

#define for_each_sd_topology(tl)			\
	for (tl = sched_domain_topology; tl->mask; tl++)

void set_sched_topology(struct sched_domain_topology_level *tl)
{
	if (WARN_ON_ONCE(sched_smp_initialized))
		return;

	sched_domain_topology = tl;
}

#ifdef CONFIG_NUMA

static const struct cpumask *sd_numa_mask(int cpu)
{
	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
}

static void sched_numa_warn(const char *str)
{
	static int done = false;
	int i, j;

	if (done)
		return;

	done = true;

	printk(KERN_WARNING "ERROR: %s\n\n", str);

	for (i = 0; i < nr_node_ids; i++) {
		printk(KERN_WARNING "  ");
		for (j = 0; j < nr_node_ids; j++)
			printk(KERN_CONT "%02d ", node_distance(i, j));
		printk(KERN_CONT "\n");
	}
	printk(KERN_WARNING "\n");
}

bool find_numa_distance(int distance)
{
	int i;

	if (distance == node_distance(0, 0))
		return true;

	for (i = 0; i < sched_domains_numa_levels; i++) {
		if (sched_domains_numa_distance[i] == distance)
			return true;
	}

	return false;
}
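/*
 * A system can have three types of NUMA topology:
 * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
 * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
 * NUMA_BACKPLANE: nodes can reach other nodes only through a backplane
 *
 * The difference between a glueless mesh topology and a backplane
 * topology lies in whether communication is possible between nodes at
 * max distance through intermediary nodes. init_numa_topology_type()
 * below detects this by looking for an intermediary node between two
 * maximally distant nodes.
 */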
static void init_numa_topology_type(void)
{
	int a, b, c, n;

	n = sched_max_numa_distance;

	if (sched_domains_numa_levels <= 2) {
		sched_numa_topology_type = NUMA_DIRECT;
		return;
	}

	for_each_online_node(a) {
		for_each_online_node(b) {
			/* Find two nodes furthest removed from each other. */
			if (node_distance(a, b) < n)
				continue;

			/* Is there an intermediary node between a and b? */
			for_each_online_node(c) {
				if (node_distance(a, c) < n &&
				    node_distance(b, c) < n) {
					sched_numa_topology_type =
							NUMA_GLUELESS_MESH;
					return;
				}
			}

			sched_numa_topology_type = NUMA_BACKPLANE;
			return;
		}
	}
}

#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)

void sched_init_numa(void)
{
	struct sched_domain_topology_level *tl;
	unsigned long *distance_map;
	int nr_levels = 0;
	int i, j;

	/*
	 * O(nr_nodes^2) de-duplicating selection sort -- in order to find the
	 * unique distances in the node_distance() table.
	 */
	distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
	if (!distance_map)
		return;

	bitmap_zero(distance_map, NR_DISTANCE_VALUES);
	for (i = 0; i < nr_node_ids; i++) {
		for (j = 0; j < nr_node_ids; j++) {
			int distance = node_distance(i, j);

			if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
				sched_numa_warn("Invalid distance value range");
				return;
			}

			bitmap_set(distance_map, distance, 1);
		}
	}
	/*
	 * We can now figure out the number of unique distance values.
	 */
	nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);

	sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
	if (!sched_domains_numa_distance) {
		bitmap_free(distance_map);
		return;
	}

	for (i = 0, j = 0; i < nr_levels; i++, j++) {
		j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
		sched_domains_numa_distance[i] = j;
	}

	bitmap_free(distance_map);
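	/*
	 * 'nr_levels' contains the number of unique distances.
	 *
	 * The sched_domains_numa_distance[] array includes the actual
	 * distance numbers.
	 *
	 * Here, we temporarily reset sched_domains_numa_levels to 0 and
	 * abort early if one of the allocations below fails; the level
	 * count is only raised to 'nr_levels' once all masks are in place,
	 * so iterating sched_domains_numa_masks[][] stays safe.
	 */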
	sched_domains_numa_levels = 0;

	sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
	if (!sched_domains_numa_masks)
		return;

	/*
	 * Now for each level, construct a mask per node which contains all
	 * CPUs of nodes that are that many hops away from us.
	 */
	for (i = 0; i < nr_levels; i++) {
		sched_domains_numa_masks[i] =
			kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
		if (!sched_domains_numa_masks[i])
			return;

		for (j = 0; j < nr_node_ids; j++) {
			struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
			int k;

			if (!mask)
				return;

			sched_domains_numa_masks[i][j] = mask;

			for_each_node(k) {
				/*
				 * Distance information can be unreliable for
				 * offline nodes, defer building and refreshing
				 * node masks to their onlining.
				 */
				if (!node_online(j))
					continue;

				if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
					sched_numa_warn("Node-distance not symmetric");

				if (node_distance(j, k) > sched_domains_numa_distance[i])
					continue;

				cpumask_or(mask, mask, cpumask_of_node(k));
			}
		}
	}

	/* Compute default topology size */
	for (i = 0; sched_domain_topology[i].mask; i++);

	tl = kzalloc((i + nr_levels + 1) *
			sizeof(struct sched_domain_topology_level), GFP_KERNEL);
	if (!tl)
		return;

	/*
	 * Copy the default topology bits..
	 */
	for (i = 0; sched_domain_topology[i].mask; i++)
		tl[i] = sched_domain_topology[i];

	/*
	 * Add the NUMA identity distance, aka single NODE.
	 */
	tl[i++] = (struct sched_domain_topology_level){
		.mask = sd_numa_mask,
		.numa_level = 0,
		SD_INIT_NAME(NODE)
	};

	/*
	 * .. and append 'j' levels of NUMA goodness.
	 */
	for (j = 1; j < nr_levels; i++, j++) {
		tl[i] = (struct sched_domain_topology_level){
			.mask = sd_numa_mask,
			.sd_flags = cpu_numa_flags,
			.flags = SDTL_OVERLAP,
			.numa_level = j,
			SD_INIT_NAME(NUMA)
		};
	}

	sched_domain_topology = tl;

	sched_domains_numa_levels = nr_levels;
	sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];

	init_numa_topology_type();

	sched_numa_onlined_nodes = bitmap_alloc(nr_node_ids, GFP_KERNEL);
	if (!sched_numa_onlined_nodes)
		return;

	bitmap_zero(sched_numa_onlined_nodes, nr_node_ids);
	for_each_online_node(i)
		bitmap_set(sched_numa_onlined_nodes, i, 1);
}

static void __sched_domains_numa_masks_set(unsigned int node)
{
	int i, j;

	/*
	 * NUMA masks are not built for offline nodes in sched_init_numa().
	 * Thus, when a CPU of a never-onlined-before node gets plugged in,
	 * the masks of its node have to be built here as well.
	 */
	if (test_bit(node, sched_numa_onlined_nodes))
		return;

	bitmap_set(sched_numa_onlined_nodes, node, 1);

	for (i = 0; i < sched_domains_numa_levels; i++) {
		for (j = 0; j < nr_node_ids; j++) {
			if (!node_online(j) || node == j)
				continue;

			if (node_distance(j, node) > sched_domains_numa_distance[i])
				continue;

			/* Add remote nodes in our masks */
			cpumask_or(sched_domains_numa_masks[i][node],
				   sched_domains_numa_masks[i][node],
				   sched_domains_numa_masks[0][j]);
		}
	}

	/*
	 * A new node has been brought up, potentially changing the topology
	 * classification.
	 *
	 * Note that this is racy vs any use of sched_numa_topology_type :/
	 */
	init_numa_topology_type();
}

void sched_domains_numa_masks_set(unsigned int cpu)
{
	int node = cpu_to_node(cpu);
	int i, j;

	__sched_domains_numa_masks_set(node);

	for (i = 0; i < sched_domains_numa_levels; i++) {
		for (j = 0; j < nr_node_ids; j++) {
			if (!node_online(j))
				continue;

			/* Set ourselves in the remote node's masks */
			if (node_distance(j, node) <= sched_domains_numa_distance[i])
				cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
		}
	}
}

void sched_domains_numa_masks_clear(unsigned int cpu)
{
	int i, j;

	for (i = 0; i < sched_domains_numa_levels; i++) {
		for (j = 0; j < nr_node_ids; j++)
			cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
	}
}

/*
 * sched_numa_find_closest() - given the NUMA topology, find the cpu
 *                             closest to @cpu from @cpumask.
 * cpumask: cpumask to find a cpu from
 * cpu: cpu to be close to
 *
 * returns: cpu, or nr_cpu_ids when nothing found.
 */
int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
{
	int i, j = cpu_to_node(cpu);

	for (i = 0; i < sched_domains_numa_levels; i++) {
		cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]);
		if (cpu < nr_cpu_ids)
			return cpu;
	}
	return nr_cpu_ids;
}

#endif /* CONFIG_NUMA */

static int __sdt_alloc(const struct cpumask *cpu_map)
{
	struct sched_domain_topology_level *tl;
	int j;

	for_each_sd_topology(tl) {
		struct sd_data *sdd = &tl->data;

		sdd->sd = alloc_percpu(struct sched_domain *);
		if (!sdd->sd)
			return -ENOMEM;

		sdd->sds = alloc_percpu(struct sched_domain_shared *);
		if (!sdd->sds)
			return -ENOMEM;

		sdd->sg = alloc_percpu(struct sched_group *);
		if (!sdd->sg)
			return -ENOMEM;

		sdd->sgc = alloc_percpu(struct sched_group_capacity *);
		if (!sdd->sgc)
			return -ENOMEM;

		for_each_cpu(j, cpu_map) {
			struct sched_domain *sd;
			struct sched_domain_shared *sds;
			struct sched_group *sg;
			struct sched_group_capacity *sgc;

			sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
					GFP_KERNEL, cpu_to_node(j));
			if (!sd)
				return -ENOMEM;

			*per_cpu_ptr(sdd->sd, j) = sd;

			sds = kzalloc_node(sizeof(struct sched_domain_shared),
					GFP_KERNEL, cpu_to_node(j));
			if (!sds)
				return -ENOMEM;

			*per_cpu_ptr(sdd->sds, j) = sds;

			sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
					GFP_KERNEL, cpu_to_node(j));
			if (!sg)
				return -ENOMEM;

			sg->next = sg;

			*per_cpu_ptr(sdd->sg, j) = sg;

			sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
					GFP_KERNEL, cpu_to_node(j));
			if (!sgc)
				return -ENOMEM;

#ifdef CONFIG_SCHED_DEBUG
			sgc->id = j;
#endif

			*per_cpu_ptr(sdd->sgc, j) = sgc;
		}
	}

	return 0;
}

static void __sdt_free(const struct cpumask *cpu_map)
{
	struct sched_domain_topology_level *tl;
	int j;

	for_each_sd_topology(tl) {
		struct sd_data *sdd = &tl->data;

		for_each_cpu(j, cpu_map) {
			struct sched_domain *sd;

			if (sdd->sd) {
				sd = *per_cpu_ptr(sdd->sd, j);
				if (sd && (sd->flags & SD_OVERLAP))
					free_sched_groups(sd->groups, 0);
				kfree(*per_cpu_ptr(sdd->sd, j));
			}

			if (sdd->sds)
				kfree(*per_cpu_ptr(sdd->sds, j));
			if (sdd->sg)
				kfree(*per_cpu_ptr(sdd->sg, j));
			if (sdd->sgc)
				kfree(*per_cpu_ptr(sdd->sgc, j));
		}
		free_percpu(sdd->sd);
		sdd->sd = NULL;
		free_percpu(sdd->sds);
		sdd->sds = NULL;
		free_percpu(sdd->sg);
		sdd->sg = NULL;
		free_percpu(sdd->sgc);
		sdd->sgc = NULL;
	}
}

static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
		struct sched_domain *child, int cpu)
{
	struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);

	if (child) {
		sd->level = child->level + 1;
		sched_domain_level_max = max(sched_domain_level_max, sd->level);
		child->parent = sd;

		if (!cpumask_subset(sched_domain_span(child),
				    sched_domain_span(sd))) {
			pr_err("BUG: arch topology borken\n");
#ifdef CONFIG_SCHED_DEBUG
			pr_err("     the %s domain not a subset of the %s domain\n",
					child->name, sd->name);
#endif
			/* Fixup, ensure @sd has at least @child CPUs. */
			cpumask_or(sched_domain_span(sd),
				   sched_domain_span(sd),
				   sched_domain_span(child));
		}

	}
	set_domain_attribute(sd, attr);

	return sd;
}

/*
 * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
 * any two given CPUs at this (non-NUMA) topology level.
 */
static bool topology_span_sane(struct sched_domain_topology_level *tl,
			       const struct cpumask *cpu_map, int cpu)
{
	int i;

	/* NUMA levels are allowed to overlap */
	if (tl->flags & SDTL_OVERLAP)
		return true;

	/*
	 * Non-NUMA levels cannot partially overlap - they must be either
	 * completely equal or completely disjoint. Otherwise we can end up
	 * breaking the sched_group lists - i.e. a later get_group() pass
	 * breaks the linking done for an earlier span.
	 */
	for_each_cpu(i, cpu_map) {
		if (i == cpu)
			continue;
		/*
		 * We should 'and' all those masks with 'cpu_map' to exactly
		 * match the topology we're about to build, but that can only
		 * remove CPUs, which only lessens our ability to detect
		 * overlaps
		 */
		if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
		    cpumask_intersects(tl->mask(cpu), tl->mask(i)))
			return false;
	}

	return true;
}

/*
 * Build sched domains for a given set of CPUs and attach the sched domains
 * to the individual CPUs.
 */
static int
build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
{
	enum s_alloc alloc_state = sa_none;
	struct sched_domain *sd;
	struct s_data d;
	struct rq *rq = NULL;
	int i, ret = -ENOMEM;
	bool has_asym = false;

	if (WARN_ON(cpumask_empty(cpu_map)))
		goto error;

	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
	if (alloc_state != sa_rootdomain)
		goto error;

	/* Set up domains for CPUs specified by the cpu_map: */
	for_each_cpu(i, cpu_map) {
		struct sched_domain_topology_level *tl;

		sd = NULL;
		for_each_sd_topology(tl) {
			if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
				goto error;

			sd = build_sched_domain(tl, cpu_map, attr, sd, i);

			has_asym |= sd->flags & SD_ASYM_CPUCAPACITY;

			if (tl == sched_domain_topology)
				*per_cpu_ptr(d.sd, i) = sd;
			if (tl->flags & SDTL_OVERLAP)
				sd->flags |= SD_OVERLAP;
			if (cpumask_equal(cpu_map, sched_domain_span(sd)))
				break;
		}
	}

	/* Build the groups for the domains */
	for_each_cpu(i, cpu_map) {
		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
			sd->span_weight = cpumask_weight(sched_domain_span(sd));
			if (sd->flags & SD_OVERLAP) {
				if (build_overlap_sched_groups(sd, i))
					goto error;
			} else {
				if (build_sched_groups(sd, i))
					goto error;
			}
		}
	}

	/* Calculate CPU capacity for physical packages and nodes */
	for (i = nr_cpumask_bits-1; i >= 0; i--) {
		if (!cpumask_test_cpu(i, cpu_map))
			continue;

		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
			claim_allocations(i, sd);
			init_sched_groups_capacity(i, sd);
		}
	}

	/* Attach the domains */
	rcu_read_lock();
	for_each_cpu(i, cpu_map) {
		rq = cpu_rq(i);
		sd = *per_cpu_ptr(d.sd, i);

		/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
		if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
			WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);

		cpu_attach_domain(sd, d.rd, i);
	}
	rcu_read_unlock();

	if (has_asym)
		static_branch_inc_cpuslocked(&sched_asym_cpucapacity);

	if (rq && sched_debug_verbose) {
		pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
			cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
	}

	ret = 0;
error:
	__free_domain_allocs(&d, alloc_state, cpu_map);

	return ret;
}

/* Current sched domains: */
static cpumask_var_t			*doms_cur;

/* Number of sched domains in 'doms_cur': */
static int				ndoms_cur;

/* Attributes of custom domains in 'doms_cur' */
static struct sched_domain_attr		*dattr_cur;

/*
 * Special case: If a kmalloc() of a doms_cur partition (array of
 * cpumask) fails, then fallback to a single sched domain,
 * as determined by the single cpumask fallback_doms.
 */
static cpumask_var_t			fallback_doms;

/*
 * arch_update_cpu_topology lets virtualized architectures update the
 * CPU core maps. It is supposed to return 1 if the topology changed
 * or 0 if it stayed the same.
 */
int __weak arch_update_cpu_topology(void)
{
	return 0;
}

cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
{
	int i;
	cpumask_var_t *doms;

	doms = kmalloc_array(ndoms, sizeof(*doms), GFP_KERNEL);
	if (!doms)
		return NULL;
	for (i = 0; i < ndoms; i++) {
		if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
			free_sched_domains(doms, i);
			return NULL;
		}
	}
	return doms;
}

void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
{
	unsigned int i;

	for (i = 0; i < ndoms; i++)
		free_cpumask_var(doms[i]);
	kfree(doms);
}

/*
 * Set up scheduler domains and groups.  For now this just excludes isolated
 * CPUs, but could be used to exclude other special cases in the future.
 */
int sched_init_domains(const struct cpumask *cpu_map)
{
	int err;

	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
	zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
	zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);

	arch_update_cpu_topology();
	asym_cpu_capacity_scan();
	ndoms_cur = 1;
	doms_cur = alloc_sched_domains(ndoms_cur);
	if (!doms_cur)
		doms_cur = &fallback_doms;
	cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
	err = build_sched_domains(doms_cur[0], NULL);

	return err;
}

/*
 * Detach sched domains from a group of CPUs specified in cpu_map.
 * These CPUs will now be attached to the NULL domain.
 */
static void detach_destroy_domains(const struct cpumask *cpu_map)
{
	unsigned int cpu = cpumask_any(cpu_map);
	int i;

	if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu)))
		static_branch_dec_cpuslocked(&sched_asym_cpucapacity);

	rcu_read_lock();
	for_each_cpu(i, cpu_map)
		cpu_attach_domain(NULL, &def_root_domain, i);
	rcu_read_unlock();
}

/* handle null as "default" */
static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
			struct sched_domain_attr *new, int idx_new)
{
	struct sched_domain_attr tmp;

	/* Fast path: */
	if (!new && !cur)
		return 1;

	tmp = SD_ATTR_INIT;

	return !memcmp(cur ? (cur + idx_cur) : &tmp,
		       new ? (new + idx_new) : &tmp,
		       sizeof(struct sched_domain_attr));
}
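/*
 * Partition sched domains as specified by the 'ndoms_new'
 * cpumasks in the array doms_new[] of cpumasks. This compares
 * doms_new[] to the current sched domain partitioning, doms_cur[].
 * It destroys each deleted domain and builds each new domain.
 *
 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
 * The masks don't intersect (don't overlap.) We should set up one
 * sched domain for each mask. CPUs not in any of the cpumasks will
 * not be load balanced. If the same cpumask appears both in the
 * current 'doms_cur' domain and in the new 'doms_new', we can leave
 * it as it is.
 *
 * The passed in 'doms_new' should be allocated using
 * alloc_sched_domains. This routine takes ownership of it and will
 * free_sched_domains it when done with it. If the caller failed the
 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
 * and this routine will fall back to the single partition
 * 'fallback_doms'; it also forces the domains to be rebuilt.
 *
 * If doms_new == NULL it will be replaced with cpu_online_mask.
 * ndoms_new == 0 is a special case for destroying existing domains,
 * and it will not create the default domain.
 *
 * Call with hotplug lock and sched_domains_mutex held.
 */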
void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
				    struct sched_domain_attr *dattr_new)
{
	bool __maybe_unused has_eas = false;
	int i, j, n;
	int new_topology;

	lockdep_assert_held(&sched_domains_mutex);

	/* Let the architecture update CPU core mappings: */
	new_topology = arch_update_cpu_topology();
	/* Trigger rebuilding CPU capacity asymmetry data */
	if (new_topology)
		asym_cpu_capacity_scan();

	if (!doms_new) {
		WARN_ON_ONCE(dattr_new);
		n = 0;
		doms_new = alloc_sched_domains(1);
		if (doms_new) {
			n = 1;
			cpumask_and(doms_new[0], cpu_active_mask,
				    housekeeping_cpumask(HK_FLAG_DOMAIN));
		}
	} else {
		n = ndoms_new;
	}

	/* Destroy deleted domains: */
	for (i = 0; i < ndoms_cur; i++) {
		for (j = 0; j < n && !new_topology; j++) {
			if (cpumask_equal(doms_cur[i], doms_new[j]) &&
			    dattrs_equal(dattr_cur, i, dattr_new, j)) {
				struct root_domain *rd;

				/*
				 * This domain won't be destroyed and as such
				 * its dl_bw->total_bw needs to be cleared.  It
				 * will be recomputed in function
				 * update_tasks_root_domain().
				 */
				rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
				dl_clear_root_domain(rd);
				goto match1;
			}
		}
		/* No match - a current sched domain not in new doms_new[] */
		detach_destroy_domains(doms_cur[i]);
match1:
		;
	}

	n = ndoms_cur;
	if (!doms_new) {
		n = 0;
		doms_new = &fallback_doms;
		cpumask_and(doms_new[0], cpu_active_mask,
			    housekeeping_cpumask(HK_FLAG_DOMAIN));
	}

	/* Build new domains: */
	for (i = 0; i < ndoms_new; i++) {
		for (j = 0; j < n && !new_topology; j++) {
			if (cpumask_equal(doms_new[i], doms_cur[j]) &&
			    dattrs_equal(dattr_new, i, dattr_cur, j))
				goto match2;
		}
		/* No match - add a new doms_new */
		build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
match2:
		;
	}

#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
	/* Build perf domains: */
	for (i = 0; i < ndoms_new; i++) {
		for (j = 0; j < n && !sched_energy_update; j++) {
			if (cpumask_equal(doms_new[i], doms_cur[j]) &&
			    cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) {
				has_eas = true;
				goto match3;
			}
		}
		/* No match - add perf domains for a new rd */
		has_eas |= build_perf_domains(doms_new[i]);
match3:
		;
	}
	sched_energy_set(has_eas);
#endif

	/* Remember the new sched domains: */
	if (doms_cur != &fallback_doms)
		free_sched_domains(doms_cur, ndoms_cur);

	kfree(dattr_cur);
	doms_cur = doms_new;
	dattr_cur = dattr_new;
	ndoms_cur = ndoms_new;

	update_sched_domain_debugfs();
}

/*
 * Call with hotplug lock held.
 */
void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
			     struct sched_domain_attr *dattr_new)
{
	mutex_lock(&sched_domains_mutex);
	partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
	mutex_unlock(&sched_domains_mutex);
}