/*
 * Scheduler topology setup/handling methods
 */
#include "sched.h"

DEFINE_MUTEX(sched_domains_mutex);

/* Protected by sched_domains_mutex: */
cpumask_var_t sched_domains_tmpmask;
cpumask_var_t sched_domains_tmpmask2;

#ifdef CONFIG_SCHED_DEBUG

static int __init sched_debug_setup(char *str)
{
	sched_debug_enabled = true;

	return 0;
}
early_param("sched_debug", sched_debug_setup);

static inline bool sched_debug(void)
{
	return sched_debug_enabled;
}

28static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
29 struct cpumask *groupmask)
30{
31 struct sched_group *group = sd->groups;
32
33 cpumask_clear(groupmask);
34
35 printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
36
37 if (!(sd->flags & SD_LOAD_BALANCE)) {
38 printk("does not load-balance\n");
39 if (sd->parent)
40 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
41 return -1;
42 }
43
44 printk(KERN_CONT "span=%*pbl level=%s\n",
45 cpumask_pr_args(sched_domain_span(sd)), sd->name);
46
47 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
48 printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
49 }
50 if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) {
51 printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
52 }
53
54 printk(KERN_DEBUG "%*s groups:", level + 1, "");
55 do {
56 if (!group) {
57 printk("\n");
58 printk(KERN_ERR "ERROR: group is NULL\n");
59 break;
60 }
61
62 if (!cpumask_weight(sched_group_span(group))) {
63 printk(KERN_CONT "\n");
64 printk(KERN_ERR "ERROR: empty group\n");
65 break;
66 }
67
68 if (!(sd->flags & SD_OVERLAP) &&
69 cpumask_intersects(groupmask, sched_group_span(group))) {
70 printk(KERN_CONT "\n");
71 printk(KERN_ERR "ERROR: repeated CPUs\n");
72 break;
73 }
74
75 cpumask_or(groupmask, groupmask, sched_group_span(group));
76
77 printk(KERN_CONT " %d:{ span=%*pbl",
78 group->sgc->id,
79 cpumask_pr_args(sched_group_span(group)));
80
81 if ((sd->flags & SD_OVERLAP) &&
82 !cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
83 printk(KERN_CONT " mask=%*pbl",
84 cpumask_pr_args(group_balance_mask(group)));
85 }
86
87 if (group->sgc->capacity != SCHED_CAPACITY_SCALE)
88 printk(KERN_CONT " cap=%lu", group->sgc->capacity);
89
90 if (group == sd->groups && sd->child &&
91 !cpumask_equal(sched_domain_span(sd->child),
92 sched_group_span(group))) {
93 printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");
94 }
95
96 printk(KERN_CONT " }");
97
98 group = group->next;
99
100 if (group != sd->groups)
101 printk(KERN_CONT ",");
102
103 } while (group != sd->groups);
104 printk(KERN_CONT "\n");
105
106 if (!cpumask_equal(sched_domain_span(sd), groupmask))
107 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
108
109 if (sd->parent &&
110 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
111 printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
112 return 0;
113}
114
115static void sched_domain_debug(struct sched_domain *sd, int cpu)
116{
117 int level = 0;
118
119 if (!sched_debug_enabled)
120 return;
121
122 if (!sd) {
123 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
124 return;
125 }
126
127 printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu);
128
129 for (;;) {
130 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
131 break;
132 level++;
133 sd = sd->parent;
134 if (!sd)
135 break;
136 }
137}
#else /* !CONFIG_SCHED_DEBUG */

# define sched_debug_enabled 0
# define sched_domain_debug(sd, cpu) do { } while (0)
static inline bool sched_debug(void)
{
	return false;
}
#endif /* CONFIG_SCHED_DEBUG */

static int sd_degenerate(struct sched_domain *sd)
{
	if (cpumask_weight(sched_domain_span(sd)) == 1)
		return 1;

	/* Following flags need at least 2 groups */
	if (sd->flags & (SD_LOAD_BALANCE |
			 SD_BALANCE_NEWIDLE |
			 SD_BALANCE_FORK |
			 SD_BALANCE_EXEC |
			 SD_SHARE_CPUCAPACITY |
			 SD_ASYM_CPUCAPACITY |
			 SD_SHARE_PKG_RESOURCES |
			 SD_SHARE_POWERDOMAIN)) {
		if (sd->groups != sd->groups->next)
			return 0;
	}

	/* Following flags don't use groups */
	if (sd->flags & (SD_WAKE_AFFINE))
		return 0;

	return 1;
}

static int
sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
{
	unsigned long cflags = sd->flags, pflags = parent->flags;

	if (sd_degenerate(parent))
		return 1;

	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
		return 0;

	/* Flags needing groups don't count if only 1 group in parent */
	if (parent->groups == parent->groups->next) {
		pflags &= ~(SD_LOAD_BALANCE |
			    SD_BALANCE_NEWIDLE |
			    SD_BALANCE_FORK |
			    SD_BALANCE_EXEC |
			    SD_ASYM_CPUCAPACITY |
			    SD_SHARE_CPUCAPACITY |
			    SD_SHARE_PKG_RESOURCES |
			    SD_PREFER_SIBLING |
			    SD_SHARE_POWERDOMAIN);
		if (nr_node_ids == 1)
			pflags &= ~SD_SERIALIZE;
	}
	if (~cflags & pflags)
		return 0;

	return 1;
}

DEFINE_STATIC_KEY_FALSE(sched_energy_present);
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
unsigned int sysctl_sched_energy_aware = 1;
DEFINE_MUTEX(sched_energy_mutex);
bool sched_energy_update;

void rebuild_sched_domains_energy(void)
{
	mutex_lock(&sched_energy_mutex);
	sched_energy_update = true;
	rebuild_sched_domains();
	sched_energy_update = false;
	mutex_unlock(&sched_energy_mutex);
}

#ifdef CONFIG_PROC_SYSCTL
int sched_energy_aware_handler(struct ctl_table *table, int write,
			       void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int ret, state;

	if (write && !capable(CAP_SYS_ADMIN))
		return -EPERM;

	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (!ret && write) {
		state = static_branch_unlikely(&sched_energy_present);
		if (state != sysctl_sched_energy_aware)
			rebuild_sched_domains_energy();
	}

	return ret;
}
#endif

static void free_pd(struct perf_domain *pd)
{
	struct perf_domain *tmp;

	while (pd) {
		tmp = pd->next;
		kfree(pd);
		pd = tmp;
	}
}

static struct perf_domain *find_pd(struct perf_domain *pd, int cpu)
{
	while (pd) {
		if (cpumask_test_cpu(cpu, perf_domain_span(pd)))
			return pd;
		pd = pd->next;
	}

	return NULL;
}

static struct perf_domain *pd_init(int cpu)
{
	struct em_perf_domain *obj = em_cpu_get(cpu);
	struct perf_domain *pd;

	if (!obj) {
		if (sched_debug())
			pr_info("%s: no EM found for CPU%d\n", __func__, cpu);
		return NULL;
	}

	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
	if (!pd)
		return NULL;
	pd->em_pd = obj;

	return pd;
}

static void perf_domain_debug(const struct cpumask *cpu_map,
			      struct perf_domain *pd)
{
	if (!sched_debug() || !pd)
		return;

	printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map));

	while (pd) {
		printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_cstate=%d }",
				cpumask_first(perf_domain_span(pd)),
				cpumask_pr_args(perf_domain_span(pd)),
				em_pd_nr_cap_states(pd->em_pd));
		pd = pd->next;
	}

	printk(KERN_CONT "\n");
}

static void destroy_perf_domain_rcu(struct rcu_head *rp)
{
	struct perf_domain *pd;

	pd = container_of(rp, struct perf_domain, rcu);
	free_pd(pd);
}

static void sched_energy_set(bool has_eas)
{
	if (!has_eas && static_branch_unlikely(&sched_energy_present)) {
		if (sched_debug())
			pr_info("%s: stopping EAS\n", __func__);
		static_branch_disable_cpuslocked(&sched_energy_present);
	} else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
		if (sched_debug())
			pr_info("%s: starting EAS\n", __func__);
		static_branch_enable_cpuslocked(&sched_energy_present);
	}
}
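
/*
 * EAS can be used for a root domain only if it meets all of the following
 * conditions, checked in build_perf_domains() below:
 *    1. an Energy Model (EM) is available for the CPUs;
 *    2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy;
 *    3. no SMT is detected;
 *    4. schedutil is driving the frequency of all CPUs of the rd;
 *    5. frequency invariance support is present;
 *    6. the EM complexity is low enough to keep scheduling overheads low.
 *
 * The complexity of the Energy Model is roughly the number of performance
 * domains times the sum of the number of CPUs and the total number of
 * capacity states:
 *
 *		C = nr_pd * (nr_cpus + nr_cs)
 *
 * EM_MAX_COMPLEXITY is an arbitrary cap on C that keeps EAS disabled on
 * platforms where walking the Energy Model in the wakeup path would be too
 * expensive.
 */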
345#define EM_MAX_COMPLEXITY 2048
346
347extern struct cpufreq_governor schedutil_gov;
348static bool build_perf_domains(const struct cpumask *cpu_map)
349{
350 int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map);
351 struct perf_domain *pd = NULL, *tmp;
352 int cpu = cpumask_first(cpu_map);
353 struct root_domain *rd = cpu_rq(cpu)->rd;
354 struct cpufreq_policy *policy;
355 struct cpufreq_governor *gov;
356
357 if (!sysctl_sched_energy_aware)
358 goto free;
359
360
361 if (!per_cpu(sd_asym_cpucapacity, cpu)) {
362 if (sched_debug()) {
363 pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
364 cpumask_pr_args(cpu_map));
365 }
366 goto free;
367 }
368
369
370 if (sched_smt_active()) {
371 pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n",
372 cpumask_pr_args(cpu_map));
373 goto free;
374 }
375
376 if (!arch_scale_freq_invariant()) {
377 if (sched_debug()) {
378 pr_warn("rd %*pbl: Disabling EAS: frequency-invariant load tracking not yet supported",
379 cpumask_pr_args(cpu_map));
380 }
381 goto free;
382 }
383
384 for_each_cpu(i, cpu_map) {
385
386 if (find_pd(pd, i))
387 continue;
388
389
390 policy = cpufreq_cpu_get(i);
391 if (!policy)
392 goto free;
393 gov = policy->governor;
394 cpufreq_cpu_put(policy);
395 if (gov != &schedutil_gov) {
396 if (rd->pd)
397 pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n",
398 cpumask_pr_args(cpu_map));
399 goto free;
400 }
401
402
403 tmp = pd_init(i);
404 if (!tmp)
405 goto free;
406 tmp->next = pd;
407 pd = tmp;
408
409
410
411
412
413 nr_pd++;
414 nr_cs += em_pd_nr_cap_states(pd->em_pd);
415 }
416
417
418 if (nr_pd * (nr_cs + nr_cpus) > EM_MAX_COMPLEXITY) {
419 WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
420 cpumask_pr_args(cpu_map));
421 goto free;
422 }
423
424 perf_domain_debug(cpu_map, pd);
425
426
427 tmp = rd->pd;
428 rcu_assign_pointer(rd->pd, pd);
429 if (tmp)
430 call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
431
432 return !!pd;
433
434free:
435 free_pd(pd);
436 tmp = rd->pd;
437 rcu_assign_pointer(rd->pd, NULL);
438 if (tmp)
439 call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
440
441 return false;
442}
#else
static void free_pd(struct perf_domain *pd) { }
#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */

static void free_rootdomain(struct rcu_head *rcu)
{
	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);

	cpupri_cleanup(&rd->cpupri);
	cpudl_cleanup(&rd->cpudl);
	free_cpumask_var(rd->dlo_mask);
	free_cpumask_var(rd->rto_mask);
	free_cpumask_var(rd->online);
	free_cpumask_var(rd->span);
	free_pd(rd->pd);
	kfree(rd);
}

void rq_attach_root(struct rq *rq, struct root_domain *rd)
{
	struct root_domain *old_rd = NULL;
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);

	if (rq->rd) {
		old_rd = rq->rd;

		if (cpumask_test_cpu(rq->cpu, old_rd->online))
			set_rq_offline(rq);

		cpumask_clear_cpu(rq->cpu, old_rd->span);

		/*
		 * If we don't want to free the old_rd yet then
		 * set old_rd to NULL to skip the freeing later
		 * in this function:
		 */
		if (!atomic_dec_and_test(&old_rd->refcount))
			old_rd = NULL;
	}

	atomic_inc(&rd->refcount);
	rq->rd = rd;

	cpumask_set_cpu(rq->cpu, rd->span);
	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
		set_rq_online(rq);

	raw_spin_unlock_irqrestore(&rq->lock, flags);

	if (old_rd)
		call_rcu(&old_rd->rcu, free_rootdomain);
}

void sched_get_rd(struct root_domain *rd)
{
	atomic_inc(&rd->refcount);
}

void sched_put_rd(struct root_domain *rd)
{
	if (!atomic_dec_and_test(&rd->refcount))
		return;

	call_rcu(&rd->rcu, free_rootdomain);
}

static int init_rootdomain(struct root_domain *rd)
{
	if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
		goto out;
	if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
		goto free_span;
	if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
		goto free_online;
	if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
		goto free_dlo_mask;

#ifdef HAVE_RT_PUSH_IPI
	rd->rto_cpu = -1;
	raw_spin_lock_init(&rd->rto_lock);
	init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
#endif

	rd->visit_gen = 0;
	init_dl_bw(&rd->dl_bw);
	if (cpudl_init(&rd->cpudl) != 0)
		goto free_rto_mask;

	if (cpupri_init(&rd->cpupri) != 0)
		goto free_cpudl;
	return 0;

free_cpudl:
	cpudl_cleanup(&rd->cpudl);
free_rto_mask:
	free_cpumask_var(rd->rto_mask);
free_dlo_mask:
	free_cpumask_var(rd->dlo_mask);
free_online:
	free_cpumask_var(rd->online);
free_span:
	free_cpumask_var(rd->span);
out:
	return -ENOMEM;
}

/*
 * By default the system creates a single root-domain with all CPUs as
 * members (mimicking the global state we have today).
 */
struct root_domain def_root_domain;

void init_defrootdomain(void)
{
	init_rootdomain(&def_root_domain);

	atomic_set(&def_root_domain.refcount, 1);
}

static struct root_domain *alloc_rootdomain(void)
{
	struct root_domain *rd;

	rd = kzalloc(sizeof(*rd), GFP_KERNEL);
	if (!rd)
		return NULL;

	if (init_rootdomain(rd) != 0) {
		kfree(rd);
		return NULL;
	}

	return rd;
}

static void free_sched_groups(struct sched_group *sg, int free_sgc)
{
	struct sched_group *tmp, *first;

	if (!sg)
		return;

	first = sg;
	do {
		tmp = sg->next;

		if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
			kfree(sg->sgc);

		if (atomic_dec_and_test(&sg->ref))
			kfree(sg);
		sg = tmp;
	} while (sg != first);
}

static void destroy_sched_domain(struct sched_domain *sd)
{
	/*
	 * A normal sched domain may have multiple group references, an
	 * overlapping domain, having private groups, only one.  Iterate,
	 * dropping group/capacity references, freeing where none remain.
	 */
	free_sched_groups(sd->groups, 1);

	if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
		kfree(sd->shared);
	kfree(sd);
}

static void destroy_sched_domains_rcu(struct rcu_head *rcu)
{
	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);

	while (sd) {
		struct sched_domain *parent = sd->parent;
		destroy_sched_domain(sd);
		sd = parent;
	}
}

static void destroy_sched_domains(struct sched_domain *sd)
{
	if (sd)
		call_rcu(&sd->rcu, destroy_sched_domains_rcu);
}
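
/*
 * Per-CPU shortcut pointers into the domain hierarchy, updated by
 * update_top_cache_domain() below whenever domains are attached:
 * sd_llc points at the highest domain with SD_SHARE_PKG_RESOURCES set
 * (the last-level-cache domain), together with its span size and an id
 * (the first CPU of its span); sd_numa, sd_asym_packing and
 * sd_asym_cpucapacity are the corresponding NUMA/asymmetry shortcuts.
 */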
DEFINE_PER_CPU(struct sched_domain *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id);
DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
DEFINE_PER_CPU(struct sched_domain *, sd_numa);
DEFINE_PER_CPU(struct sched_domain *, sd_asym_packing);
DEFINE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity);
DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);

static void update_top_cache_domain(int cpu)
{
	struct sched_domain_shared *sds = NULL;
	struct sched_domain *sd;
	int id = cpu;
	int size = 1;

	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
	if (sd) {
		id = cpumask_first(sched_domain_span(sd));
		size = cpumask_weight(sched_domain_span(sd));
		sds = sd->shared;
	}

	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
	per_cpu(sd_llc_size, cpu) = size;
	per_cpu(sd_llc_id, cpu) = id;
	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);

	sd = lowest_flag_domain(cpu, SD_NUMA);
	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);

	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
	rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);

	sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY);
	rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
}
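
/*
 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
 * hold the hotplug lock.
 */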
static void
cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	struct sched_domain *tmp;
	int numa_distance = 0;

	/* Remove the sched domains which do not contribute to scheduling. */
	for (tmp = sd; tmp; ) {
		struct sched_domain *parent = tmp->parent;
		if (!parent)
			break;

		if (sd_parent_degenerate(tmp, parent)) {
			tmp->parent = parent->parent;
			if (parent->parent)
				parent->parent->child = tmp;
			/*
			 * Transfer SD_PREFER_SIBLING down in case of a
			 * degenerate parent; the spans match for this
			 * so the property transfers.
			 */
			if (parent->flags & SD_PREFER_SIBLING)
				tmp->flags |= SD_PREFER_SIBLING;
			destroy_sched_domain(parent);
		} else
			tmp = tmp->parent;
	}

	if (sd && sd_degenerate(sd)) {
		tmp = sd;
		sd = sd->parent;
		destroy_sched_domain(tmp);
		if (sd) {
			struct sched_group *sg = sd->groups;

			/*
			 * sched groups hold the flags of the child sched
			 * domain for convenience. Clear such flags since
			 * the child is being destroyed.
			 */
			do {
				sg->flags = 0;
			} while (sg != sd->groups);

			sd->child = NULL;
		}
	}

	for (tmp = sd; tmp; tmp = tmp->parent)
		numa_distance += !!(tmp->flags & SD_NUMA);

	sched_domain_debug(sd, cpu);

	rq_attach_root(rq, rd);
	tmp = rq->sd;
	rcu_assign_pointer(rq->sd, sd);
	dirty_sched_domain_sysctl(cpu);
	destroy_sched_domains(tmp);

	update_top_cache_domain(cpu);
}

struct s_data {
	struct sched_domain * __percpu *sd;
	struct root_domain	*rd;
};

enum s_alloc {
	sa_rootdomain,
	sa_sd,
	sa_sd_storage,
	sa_none,
};
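
/*
 * Return the canonical balance CPU for this group, this is the first CPU
 * of this group that's also in the balance mask.
 *
 * The balance mask are all those CPUs that could actually end up at this
 * group. See build_balance_mask().
 *
 * Also see should_we_balance().
 */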
int group_balance_cpu(struct sched_group *sg)
{
	return cpumask_first(group_balance_mask(sg));
}
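
/*
 * Overlapping (NUMA) sched groups:
 *
 * For levels built with SDTL_OVERLAP the groups of a domain may partially
 * overlap rather than partition the domain span, and the group/capacity
 * structures are shared between CPUs. Not every CPU that appears in a
 * group's span may use that group as its local balancing group; only the
 * CPUs whose child domain span exactly matches the group span can.
 *
 * build_balance_mask() computes that set for a group,
 * init_overlap_sched_group() stores it in the shared sched_group_capacity,
 * and group_balance_cpu() picks its first CPU as the one allowed to balance
 * on behalf of the group.
 */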
static void
build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
{
	const struct cpumask *sg_span = sched_group_span(sg);
	struct sd_data *sdd = sd->private;
	struct sched_domain *sibling;
	int i;

	cpumask_clear(mask);

	for_each_cpu(i, sg_span) {
		sibling = *per_cpu_ptr(sdd->sd, i);

		/*
		 * Can happen in the asymmetric case, where these siblings are
		 * unused. The mask will not be empty because those CPUs that
		 * do have the top domain _should_ span the domain.
		 */
		if (!sibling->child)
			continue;

		/* If we would not end up here, we can't continue from here: */
		if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
			continue;

		cpumask_set_cpu(i, mask);
	}

	/* We must not have empty masks here */
	WARN_ON_ONCE(cpumask_empty(mask));
}
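
/*
 * Build a sched_group for an overlapping (NUMA) domain level: the group
 * span is copied from @sd's child domain when it exists, otherwise from
 * @sd itself. Capacity and the balance mask are filled in afterwards by
 * init_overlap_sched_group().
 */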
static struct sched_group *
build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
{
	struct sched_group *sg;
	struct cpumask *sg_span;

	sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
			GFP_KERNEL, cpu_to_node(cpu));

	if (!sg)
		return NULL;

	sg_span = sched_group_span(sg);
	if (sd->child) {
		cpumask_copy(sg_span, sched_domain_span(sd->child));
		sg->flags = sd->child->flags;
	} else {
		cpumask_copy(sg_span, sched_domain_span(sd));
	}

	atomic_inc(&sg->ref);
	return sg;
}

static void init_overlap_sched_group(struct sched_domain *sd,
				     struct sched_group *sg)
{
	struct cpumask *mask = sched_domains_tmpmask2;
	struct sd_data *sdd = sd->private;
	struct cpumask *sg_span;
	int cpu;

	build_balance_mask(sd, sg, mask);
	cpu = cpumask_first(mask);

	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
	if (atomic_inc_return(&sg->sgc->ref) == 1)
		cpumask_copy(group_balance_mask(sg), mask);
	else
		WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask));

	/*
	 * Initialize sgc->capacity such that even if we mess up the
	 * domains and no possible iteration will get us here, we won't
	 * die on a /0 trap.
	 */
	sg_span = sched_group_span(sg);
	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
	sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
}

static struct sched_domain *
find_descended_sibling(struct sched_domain *sd, struct sched_domain *sibling)
{
	/*
	 * The proper descendant would be the one whose child won't span out
	 * of sd.
	 */
	while (sibling->child &&
	       !cpumask_subset(sched_domain_span(sibling->child),
			       sched_domain_span(sd)))
		sibling = sibling->child;

	/*
	 * As we are referencing sgc across different topology levels, we need
	 * to go down to skip those sched_domains which don't contribute to
	 * scheduling because they will be degenerated in cpu_attach_domain().
	 */
	while (sibling->child &&
	       cpumask_equal(sched_domain_span(sibling->child),
			     sched_domain_span(sibling)))
		sibling = sibling->child;

	return sibling;
}

static int
build_overlap_sched_groups(struct sched_domain *sd, int cpu)
{
	struct sched_group *first = NULL, *last = NULL, *sg;
	const struct cpumask *span = sched_domain_span(sd);
	struct cpumask *covered = sched_domains_tmpmask;
	struct sd_data *sdd = sd->private;
	struct sched_domain *sibling;
	int i;

	cpumask_clear(covered);

	for_each_cpu_wrap(i, span, cpu) {
		struct cpumask *sg_span;

		if (cpumask_test_cpu(i, covered))
			continue;

		sibling = *per_cpu_ptr(sdd->sd, i);

		/*
		 * Asymmetric node setups can result in situations where the
		 * domain tree is of unequal depth; make sure to skip domains
		 * that already cover the entire range.
		 *
		 * In that case build_sched_domains() will have terminated the
		 * iteration early and our sibling sd spans will be empty.
		 * Domains should always include the CPU they're built on, so
		 * check that.
		 */
		if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
			continue;

		/*
		 * Usually we build the group from the sibling's child domain.
		 * But for machines whose NUMA diameter is 3 or above, the
		 * sibling's child domain can span outside the domain being
		 * built here, so descend to a deeper sibling level instead.
		 */
		if (sibling->child &&
		    !cpumask_subset(sched_domain_span(sibling->child), span))
			sibling = find_descended_sibling(sd, sibling);

		sg = build_group_from_child_sched_domain(sibling, cpu);
		if (!sg)
			goto fail;

		sg_span = sched_group_span(sg);
		cpumask_or(covered, covered, sg_span);

		init_overlap_sched_group(sibling, sg);

		if (!first)
			first = sg;
		if (last)
			last->next = sg;
		last = sg;
		last->next = first;
	}
	sd->groups = first;

	return 0;

fail:
	free_sched_groups(first, 0);

	return -ENOMEM;
}
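
/*
 * Non-overlapping (package) sched groups:
 *
 * For the regular topology levels each domain's groups form an exact
 * partition of the domain span. get_group() returns the group of @cpu for
 * the level described by @sdd: the group spans @cpu's child domain (or just
 * @cpu itself at the lowest level), and the group and its capacity storage
 * are shared by all CPUs that map to the same group, hence the reference
 * counts taken here (consumed later by claim_allocations()).
 */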
static struct sched_group *get_group(int cpu, struct sd_data *sdd)
{
	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
	struct sched_domain *child = sd->child;
	struct sched_group *sg;

	if (child)
		cpu = cpumask_first(sched_domain_span(child));

	sg = *per_cpu_ptr(sdd->sg, cpu);
	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);

	/* For claim_allocations: */
	atomic_inc(&sg->ref);
	atomic_inc(&sg->sgc->ref);

	if (child) {
		cpumask_copy(sched_group_span(sg), sched_domain_span(child));
		cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
		sg->flags = child->flags;
	} else {
		cpumask_set_cpu(cpu, sched_group_span(sg));
		cpumask_set_cpu(cpu, group_balance_mask(sg));
	}

	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
	sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;

	return sg;
}
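
/*
 * build_sched_groups() builds a circular, single linked list of the groups
 * covered by the given span, starting with the group containing @cpu.
 *
 * Assumes the sched_domain tree is fully constructed.
 */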
static int
build_sched_groups(struct sched_domain *sd, int cpu)
{
	struct sched_group *first = NULL, *last = NULL;
	struct sd_data *sdd = sd->private;
	const struct cpumask *span = sched_domain_span(sd);
	struct cpumask *covered;
	int i;

	lockdep_assert_held(&sched_domains_mutex);
	covered = sched_domains_tmpmask;

	cpumask_clear(covered);

	for_each_cpu_wrap(i, span, cpu) {
		struct sched_group *sg;

		if (cpumask_test_cpu(i, covered))
			continue;

		sg = get_group(i, sdd);

		cpumask_or(covered, covered, sched_group_span(sg));

		if (!first)
			first = sg;
		if (last)
			last->next = sg;
		last = sg;
	}
	last->next = first;
	sd->groups = first;

	return 0;
}
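
/*
 * Initialize sched groups cpu_capacity.
 *
 * cpu_capacity indicates the capacity of a sched group, which is used while
 * distributing the load between different sched groups in a sched domain.
 * Typically cpu_capacity for all the groups in a sched domain will be same
 * unless there are asymmetries in the topology. If there are asymmetries,
 * the group having more cpu_capacity will pick up more load compared to the
 * group having less cpu_capacity.
 */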
static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
{
	struct sched_group *sg = sd->groups;

	WARN_ON(!sg);

	do {
		int cpu, max_cpu = -1;

		sg->group_weight = cpumask_weight(sched_group_span(sg));

		if (!(sd->flags & SD_ASYM_PACKING))
			goto next;

		for_each_cpu(cpu, sched_group_span(sg)) {
			if (max_cpu < 0)
				max_cpu = cpu;
			else if (sched_asym_prefer(cpu, max_cpu))
				max_cpu = cpu;
		}
		sg->asym_prefer_cpu = max_cpu;

next:
		sg = sg->next;
	} while (sg != sd->groups);

	if (cpu != group_balance_cpu(sg))
		return;

	update_group_capacity(sd, cpu);
}
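
/*
 * "relax_domain_level=" boot parameter: set_domain_attribute() below clears
 * SD_BALANCE_WAKE and SD_BALANCE_NEWIDLE on domains whose level is above the
 * requested value, so wake/newidle balancing only happens at or below it.
 */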
static int default_relax_domain_level = -1;
int sched_domain_level_max;

static int __init setup_relax_domain_level(char *str)
{
	if (kstrtoint(str, 0, &default_relax_domain_level))
		pr_warn("Unable to set relax_domain_level\n");

	return 1;
}
__setup("relax_domain_level=", setup_relax_domain_level);

static void set_domain_attribute(struct sched_domain *sd,
				 struct sched_domain_attr *attr)
{
	int request;

	if (!attr || attr->relax_domain_level < 0) {
		if (default_relax_domain_level < 0)
			return;
		request = default_relax_domain_level;
	} else
		request = attr->relax_domain_level;

	if (sd->level > request) {
		/* Turn off idle balance on this domain: */
		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
	}
}

static void __sdt_free(const struct cpumask *cpu_map);
static int __sdt_alloc(const struct cpumask *cpu_map);

static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
				 const struct cpumask *cpu_map)
{
	switch (what) {
	case sa_rootdomain:
		if (!atomic_read(&d->rd->refcount))
			free_rootdomain(&d->rd->rcu);
		fallthrough;
	case sa_sd:
		free_percpu(d->sd);
		fallthrough;
	case sa_sd_storage:
		__sdt_free(cpu_map);
		fallthrough;
	case sa_none:
		break;
	}
}

static enum s_alloc
__visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
{
	memset(d, 0, sizeof(*d));

	if (__sdt_alloc(cpu_map))
		return sa_sd_storage;
	d->sd = alloc_percpu(struct sched_domain *);
	if (!d->sd)
		return sa_sd_storage;
	d->rd = alloc_rootdomain();
	if (!d->rd)
		return sa_sd;

	return sa_rootdomain;
}
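
/*
 * NULL the sd_data elements we've used to build the sched_domain and
 * sched_group structure so that the subsequent __free_domain_allocs()
 * will not free the data we're using.
 */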
static void claim_allocations(int cpu, struct sched_domain *sd)
{
	struct sd_data *sdd = sd->private;

	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
	*per_cpu_ptr(sdd->sd, cpu) = NULL;

	if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
		*per_cpu_ptr(sdd->sds, cpu) = NULL;

	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
		*per_cpu_ptr(sdd->sg, cpu) = NULL;

	if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
		*per_cpu_ptr(sdd->sgc, cpu) = NULL;
}

#ifdef CONFIG_NUMA
enum numa_topology_type sched_numa_topology_type;

static int sched_domains_numa_levels;
static int sched_domains_curr_level;

int sched_max_numa_distance;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
#endif
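
/*
 * SD_flags allowed in topology descriptions.
 *
 * These flags are purely descriptive of the topology and can be set from the
 * per-level ->sd_flags() callbacks:
 *
 *   SD_SHARE_CPUCAPACITY   - describes SMT topologies
 *   SD_SHARE_PKG_RESOURCES - describes shared caches
 *   SD_NUMA                - describes NUMA topologies
 *   SD_SHARE_POWERDOMAIN   - describes shared power domains
 *
 *   SD_ASYM_PACKING        - describes SMT quirks; besides describing the
 *                            topology it also prescribes behaviour.
 *
 * Anything else set by ->sd_flags() triggers the WARN_ONCE() in sd_init().
 */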
#define TOPOLOGY_SD_FLAGS		\
	(SD_SHARE_CPUCAPACITY	|	\
	 SD_SHARE_PKG_RESOURCES |	\
	 SD_NUMA		|	\
	 SD_ASYM_PACKING	|	\
	 SD_SHARE_POWERDOMAIN)

1406static struct sched_domain *
1407sd_init(struct sched_domain_topology_level *tl,
1408 const struct cpumask *cpu_map,
1409 struct sched_domain *child, int dflags, int cpu)
1410{
1411 struct sd_data *sdd = &tl->data;
1412 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
1413 int sd_id, sd_weight, sd_flags = 0;
1414
1415#ifdef CONFIG_NUMA
1416
1417
1418
1419 sched_domains_curr_level = tl->numa_level;
1420#endif
1421
1422 sd_weight = cpumask_weight(tl->mask(cpu));
1423
1424 if (tl->sd_flags)
1425 sd_flags = (*tl->sd_flags)();
1426 if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
1427 "wrong sd_flags in topology description\n"))
1428 sd_flags &= TOPOLOGY_SD_FLAGS;
1429
1430
1431 sd_flags |= dflags;
1432
1433 *sd = (struct sched_domain){
1434 .min_interval = sd_weight,
1435 .max_interval = 2*sd_weight,
1436 .busy_factor = 16,
1437 .imbalance_pct = 117,
1438
1439 .cache_nice_tries = 0,
1440
1441 .flags = 1*SD_LOAD_BALANCE
1442 | 1*SD_BALANCE_NEWIDLE
1443 | 1*SD_BALANCE_EXEC
1444 | 1*SD_BALANCE_FORK
1445 | 0*SD_BALANCE_WAKE
1446 | 1*SD_WAKE_AFFINE
1447 | 0*SD_SHARE_CPUCAPACITY
1448 | 0*SD_SHARE_PKG_RESOURCES
1449 | 0*SD_SERIALIZE
1450 | 1*SD_PREFER_SIBLING
1451 | 0*SD_NUMA
1452 | sd_flags
1453 ,
1454
1455 .last_balance = jiffies,
1456 .balance_interval = sd_weight,
1457 .max_newidle_lb_cost = 0,
1458 .next_decay_max_lb_cost = jiffies,
1459 .child = child,
1460#ifdef CONFIG_SCHED_DEBUG
1461 .name = tl->name,
1462#endif
1463 };
1464
1465 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
1466 sd_id = cpumask_first(sched_domain_span(sd));
1467
1468
1469
1470
1471
1472 if (sd->flags & SD_ASYM_CPUCAPACITY) {
1473 struct sched_domain *t = sd;
1474
1475
1476
1477
1478 if (sd->child)
1479 sd->child->flags &= ~SD_PREFER_SIBLING;
1480
1481 for_each_lower_domain(t)
1482 t->flags |= SD_BALANCE_WAKE;
1483 }
1484
1485 if (sd->flags & SD_SHARE_CPUCAPACITY) {
1486 sd->imbalance_pct = 110;
1487
1488 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
1489 sd->imbalance_pct = 117;
1490 sd->cache_nice_tries = 1;
1491
1492#ifdef CONFIG_NUMA
1493 } else if (sd->flags & SD_NUMA) {
1494 sd->cache_nice_tries = 2;
1495
1496 sd->flags &= ~SD_PREFER_SIBLING;
1497 sd->flags |= SD_SERIALIZE;
1498 if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
1499 sd->flags &= ~(SD_BALANCE_EXEC |
1500 SD_BALANCE_FORK |
1501 SD_WAKE_AFFINE);
1502 }
1503
1504#endif
1505 } else {
1506 sd->cache_nice_tries = 1;
1507 }
1508
1509
1510
1511
1512
1513 if (sd->flags & SD_SHARE_PKG_RESOURCES) {
1514 sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
1515 atomic_inc(&sd->shared->ref);
1516 atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
1517 }
1518
1519 sd->private = sdd;
1520
1521 return sd;
1522}
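
/*
 * Topology list, bottom-up.
 *
 * Architectures can install their own table with set_sched_topology(); a
 * minimal, purely illustrative example with only SMT and DIE levels could
 * look like:
 *
 *	static struct sched_domain_topology_level my_topology[] = {
 *		{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
 *		{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
 *		{ NULL, },
 *	};
 *
 *	set_sched_topology(my_topology);	// must run before SMP init
 */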
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif

#ifdef CONFIG_SCHED_CLUSTER
	{ cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) },
#endif

#ifdef CONFIG_SCHED_MC
	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
	{ NULL, },
};

static struct sched_domain_topology_level *sched_domain_topology =
	default_topology;

#define for_each_sd_topology(tl)			\
	for (tl = sched_domain_topology; tl->mask; tl++)

void set_sched_topology(struct sched_domain_topology_level *tl)
{
	if (WARN_ON_ONCE(sched_smp_initialized))
		return;

	sched_domain_topology = tl;
}

#ifdef CONFIG_NUMA

static const struct cpumask *sd_numa_mask(int cpu)
{
	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
}

static void sched_numa_warn(const char *str)
{
	static int done = false;
	int i, j;

	if (done)
		return;

	done = true;

	printk(KERN_WARNING "ERROR: %s\n\n", str);

	for (i = 0; i < nr_node_ids; i++) {
		printk(KERN_WARNING " ");
		for (j = 0; j < nr_node_ids; j++)
			printk(KERN_CONT "%02d ", node_distance(i, j));
		printk(KERN_CONT "\n");
	}
	printk(KERN_WARNING "\n");
}

bool find_numa_distance(int distance)
{
	int i;

	if (distance == node_distance(0, 0))
		return true;

	for (i = 0; i < sched_domains_numa_levels; i++) {
		if (sched_domains_numa_distance[i] == distance)
			return true;
	}

	return false;
}
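
/*
 * A system can have three types of NUMA topology:
 * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
 * NUMA_GLUELESS_MESH: some nodes reachable through an intermediary node
 * NUMA_BACKPLANE: nodes can only be reached through an intermediary node
 *
 * The difference between a glueless mesh topology and a backplane topology
 * lies in whether communication between not directly connected nodes goes
 * through intermediary nodes (where programs could run), or through backplane
 * controllers. This affects placement of programs.
 *
 * init_numa_topology_type() below classifies the machine by looking at the
 * two most distant nodes and checking for a usable node in between them.
 */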
static void init_numa_topology_type(void)
{
	int a, b, c, n;

	n = sched_max_numa_distance;

	if (sched_domains_numa_levels <= 2) {
		sched_numa_topology_type = NUMA_DIRECT;
		return;
	}

	for_each_online_node(a) {
		for_each_online_node(b) {
			/* Find two nodes furthest removed from each other. */
			if (node_distance(a, b) < n)
				continue;

			/* Is there an intermediary node between a and b? */
			for_each_online_node(c) {
				if (node_distance(a, c) < n &&
				    node_distance(b, c) < n) {
					sched_numa_topology_type =
							NUMA_GLUELESS_MESH;
					return;
				}
			}

			sched_numa_topology_type = NUMA_BACKPLANE;
			return;
		}
	}
}

1652void sched_init_numa(void)
1653{
1654 int next_distance, curr_distance = node_distance(0, 0);
1655 struct sched_domain_topology_level *tl;
1656 int level = 0;
1657 int i, j, k;
1658
1659 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
1660 if (!sched_domains_numa_distance)
1661 return;
1662
1663
1664 sched_domains_numa_distance[level++] = curr_distance;
1665 sched_domains_numa_levels = level;
1666
1667
1668
1669
1670
1671
1672
1673
1674 next_distance = curr_distance;
1675 for (i = 0; i < nr_node_ids; i++) {
1676 for (j = 0; j < nr_node_ids; j++) {
1677 for (k = 0; k < nr_node_ids; k++) {
1678 int distance = node_distance(i, k);
1679
1680 if (distance > curr_distance &&
1681 (distance < next_distance ||
1682 next_distance == curr_distance))
1683 next_distance = distance;
1684
1685
1686
1687
1688
1689
1690 if (sched_debug() && node_distance(k, i) != distance)
1691 sched_numa_warn("Node-distance not symmetric");
1692
1693 if (sched_debug() && i && !find_numa_distance(distance))
1694 sched_numa_warn("Node-0 not representative");
1695 }
1696 if (next_distance != curr_distance) {
1697 sched_domains_numa_distance[level++] = next_distance;
1698 sched_domains_numa_levels = level;
1699 curr_distance = next_distance;
1700 } else break;
1701 }
1702
1703
1704
1705
1706 if (!sched_debug())
1707 break;
1708 }
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726 sched_domains_numa_levels = 0;
1727
1728 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
1729 if (!sched_domains_numa_masks)
1730 return;
1731
1732
1733
1734
1735
1736 for (i = 0; i < level; i++) {
1737 sched_domains_numa_masks[i] =
1738 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
1739 if (!sched_domains_numa_masks[i])
1740 return;
1741
1742 for (j = 0; j < nr_node_ids; j++) {
1743 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
1744 if (!mask)
1745 return;
1746
1747 sched_domains_numa_masks[i][j] = mask;
1748
1749 for_each_node(k) {
1750 if (node_distance(j, k) > sched_domains_numa_distance[i])
1751 continue;
1752
1753 cpumask_or(mask, mask, cpumask_of_node(k));
1754 }
1755 }
1756 }
1757
1758
1759 for (i = 0; sched_domain_topology[i].mask; i++);
1760
1761 tl = kzalloc((i + level + 1) *
1762 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
1763 if (!tl)
1764 return;
1765
1766
1767
1768
1769 for (i = 0; sched_domain_topology[i].mask; i++)
1770 tl[i] = sched_domain_topology[i];
1771
1772
1773
1774
1775 tl[i++] = (struct sched_domain_topology_level){
1776 .mask = sd_numa_mask,
1777 .numa_level = 0,
1778 SD_INIT_NAME(NODE)
1779 };
1780
1781
1782
1783
1784 for (j = 1; j < level; i++, j++) {
1785 tl[i] = (struct sched_domain_topology_level){
1786 .mask = sd_numa_mask,
1787 .sd_flags = cpu_numa_flags,
1788 .flags = SDTL_OVERLAP,
1789 .numa_level = j,
1790 SD_INIT_NAME(NUMA)
1791 };
1792 }
1793
1794 sched_domain_topology = tl;
1795
1796 sched_domains_numa_levels = level;
1797 sched_max_numa_distance = sched_domains_numa_distance[level - 1];
1798
1799 init_numa_topology_type();
1800}

void sched_domains_numa_masks_set(unsigned int cpu)
{
	int node = cpu_to_node(cpu);
	int i, j;

	for (i = 0; i < sched_domains_numa_levels; i++) {
		for (j = 0; j < nr_node_ids; j++) {
			if (node_distance(j, node) <= sched_domains_numa_distance[i])
				cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
		}
	}
}

void sched_domains_numa_masks_clear(unsigned int cpu)
{
	int i, j;

	for (i = 0; i < sched_domains_numa_levels; i++) {
		for (j = 0; j < nr_node_ids; j++)
			cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
	}
}

/*
 * sched_numa_find_closest() - given the NUMA topology, find the cpu
 *                             closest to @cpu from @cpumask.
 * cpumask: cpumask to find a cpu from
 * cpu: cpu to be close to
 *
 * returns: cpu, or nr_cpu_ids when nothing found.
 */
int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
{
	int i, j = cpu_to_node(cpu);

	for (i = 0; i < sched_domains_numa_levels; i++) {
		cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]);
		if (cpu < nr_cpu_ids)
			return cpu;
	}
	return nr_cpu_ids;
}

#endif /* CONFIG_NUMA */

1847static int __sdt_alloc(const struct cpumask *cpu_map)
1848{
1849 struct sched_domain_topology_level *tl;
1850 int j;
1851
1852 for_each_sd_topology(tl) {
1853 struct sd_data *sdd = &tl->data;
1854
1855 sdd->sd = alloc_percpu(struct sched_domain *);
1856 if (!sdd->sd)
1857 return -ENOMEM;
1858
1859 sdd->sds = alloc_percpu(struct sched_domain_shared *);
1860 if (!sdd->sds)
1861 return -ENOMEM;
1862
1863 sdd->sg = alloc_percpu(struct sched_group *);
1864 if (!sdd->sg)
1865 return -ENOMEM;
1866
1867 sdd->sgc = alloc_percpu(struct sched_group_capacity *);
1868 if (!sdd->sgc)
1869 return -ENOMEM;
1870
1871 for_each_cpu(j, cpu_map) {
1872 struct sched_domain *sd;
1873 struct sched_domain_shared *sds;
1874 struct sched_group *sg;
1875 struct sched_group_capacity *sgc;
1876
1877 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
1878 GFP_KERNEL, cpu_to_node(j));
1879 if (!sd)
1880 return -ENOMEM;
1881
1882 *per_cpu_ptr(sdd->sd, j) = sd;
1883
1884 sds = kzalloc_node(sizeof(struct sched_domain_shared),
1885 GFP_KERNEL, cpu_to_node(j));
1886 if (!sds)
1887 return -ENOMEM;
1888
1889 *per_cpu_ptr(sdd->sds, j) = sds;
1890
1891 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
1892 GFP_KERNEL, cpu_to_node(j));
1893 if (!sg)
1894 return -ENOMEM;
1895
1896 sg->next = sg;
1897
1898 *per_cpu_ptr(sdd->sg, j) = sg;
1899
1900 sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
1901 GFP_KERNEL, cpu_to_node(j));
1902 if (!sgc)
1903 return -ENOMEM;
1904
1905#ifdef CONFIG_SCHED_DEBUG
1906 sgc->id = j;
1907#endif
1908
1909 *per_cpu_ptr(sdd->sgc, j) = sgc;
1910 }
1911 }
1912
1913 return 0;
1914}
1915
1916static void __sdt_free(const struct cpumask *cpu_map)
1917{
1918 struct sched_domain_topology_level *tl;
1919 int j;
1920
1921 for_each_sd_topology(tl) {
1922 struct sd_data *sdd = &tl->data;
1923
1924 for_each_cpu(j, cpu_map) {
1925 struct sched_domain *sd;
1926
1927 if (sdd->sd) {
1928 sd = *per_cpu_ptr(sdd->sd, j);
1929 if (sd && (sd->flags & SD_OVERLAP))
1930 free_sched_groups(sd->groups, 0);
1931 kfree(*per_cpu_ptr(sdd->sd, j));
1932 }
1933
1934 if (sdd->sds)
1935 kfree(*per_cpu_ptr(sdd->sds, j));
1936 if (sdd->sg)
1937 kfree(*per_cpu_ptr(sdd->sg, j));
1938 if (sdd->sgc)
1939 kfree(*per_cpu_ptr(sdd->sgc, j));
1940 }
1941 free_percpu(sdd->sd);
1942 sdd->sd = NULL;
1943 free_percpu(sdd->sds);
1944 sdd->sds = NULL;
1945 free_percpu(sdd->sg);
1946 sdd->sg = NULL;
1947 free_percpu(sdd->sgc);
1948 sdd->sgc = NULL;
1949 }
1950}

static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
		struct sched_domain *child, int dflags, int cpu)
{
	struct sched_domain *sd = sd_init(tl, cpu_map, child, dflags, cpu);

	if (child) {
		sd->level = child->level + 1;
		sched_domain_level_max = max(sched_domain_level_max, sd->level);
		child->parent = sd;

		if (!cpumask_subset(sched_domain_span(child),
				    sched_domain_span(sd))) {
			pr_err("BUG: arch topology borken\n");
#ifdef CONFIG_SCHED_DEBUG
			pr_err("     the %s domain not a subset of the %s domain\n",
					child->name, sd->name);
#endif
			/* Fixup, ensure @sd has at least @child CPUs. */
			cpumask_or(sched_domain_span(sd),
				   sched_domain_span(sd),
				   sched_domain_span(child));
		}

	}
	set_domain_attribute(sd, attr);

	return sd;
}
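
/*
 * Ensure topology masks are sane: for a non-NUMA level, the masks of two
 * different CPUs must either be equal or non-intersecting, otherwise the
 * sched_group lists built from them would be corrupted.
 */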
static bool topology_span_sane(struct sched_domain_topology_level *tl,
			       const struct cpumask *cpu_map, int cpu)
{
	int i;

	/* NUMA levels are allowed to overlap: */
	if (tl->flags & SDTL_OVERLAP)
		return true;

	/*
	 * Non-NUMA levels cannot partially overlap - they must be either
	 * completely equal or completely disjoint. Otherwise we can end up
	 * breaking the sched_group lists - i.e. a later get_group() pass
	 * breaks the linking done for an earlier span.
	 */
	for_each_cpu(i, cpu_map) {
		if (i == cpu)
			continue;

		/*
		 * We should 'and' all those masks with 'cpu_map' to exactly
		 * match the topology we're about to build, but that can only
		 * remove CPUs, which only lessens our ability to detect
		 * overlaps.
		 */
		if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
		    cpumask_intersects(tl->mask(cpu), tl->mask(i)))
			return false;
	}

	return true;
}
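
/*
 * Find the sched_domain_topology_level where all CPU capacities are visible
 * for all CPUs; this is the level at which build_sched_domains() will set
 * SD_ASYM_CPUCAPACITY.
 */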
static struct sched_domain_topology_level
*asym_cpu_capacity_level(const struct cpumask *cpu_map)
{
	int i, j, asym_level = 0;
	bool asym = false;
	struct sched_domain_topology_level *tl, *asym_tl = NULL;
	unsigned long cap;

	/* Is there any asymmetry? */
	cap = arch_scale_cpu_capacity(cpumask_first(cpu_map));

	for_each_cpu(i, cpu_map) {
		if (arch_scale_cpu_capacity(i) != cap) {
			asym = true;
			break;
		}
	}

	if (!asym)
		return NULL;

	/*
	 * Examine topology from all CPU's point of views to detect the lowest
	 * sched_domain_topology_level where a highest capacity CPU is visible
	 * to everyone.
	 */
	for_each_cpu(i, cpu_map) {
		unsigned long max_capacity = arch_scale_cpu_capacity(i);
		int tl_id = 0;

		for_each_sd_topology(tl) {
			if (tl_id < asym_level)
				goto next_level;

			for_each_cpu_and(j, tl->mask(i), cpu_map) {
				unsigned long capacity;

				capacity = arch_scale_cpu_capacity(j);

				if (capacity <= max_capacity)
					continue;

				max_capacity = capacity;
				asym_level = tl_id;
				asym_tl = tl;
			}
next_level:
			tl_id++;
		}
	}

	return asym_tl;
}
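
/*
 * Build sched domains for a given set of CPUs and attach the sched domains
 * to the individual CPUs.
 */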
2081static int
2082build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
2083{
2084 enum s_alloc alloc_state = sa_none;
2085 struct sched_domain *sd;
2086 struct s_data d;
2087 struct rq *rq = NULL;
2088 int i, ret = -ENOMEM;
2089 struct sched_domain_topology_level *tl_asym;
2090 bool has_asym = false;
2091
2092 if (WARN_ON(cpumask_empty(cpu_map)))
2093 goto error;
2094
2095 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
2096 if (alloc_state != sa_rootdomain)
2097 goto error;
2098
2099 tl_asym = asym_cpu_capacity_level(cpu_map);
2100
2101
2102 for_each_cpu(i, cpu_map) {
2103 struct sched_domain_topology_level *tl;
2104
2105 sd = NULL;
2106 for_each_sd_topology(tl) {
2107 int dflags = 0;
2108
2109 if (tl == tl_asym) {
2110 dflags |= SD_ASYM_CPUCAPACITY;
2111 has_asym = true;
2112 }
2113
2114 if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
2115 goto error;
2116
2117 sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);
2118
2119 if (tl == sched_domain_topology)
2120 *per_cpu_ptr(d.sd, i) = sd;
2121 if (tl->flags & SDTL_OVERLAP)
2122 sd->flags |= SD_OVERLAP;
2123 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
2124 break;
2125 }
2126 }
2127
2128
2129 for_each_cpu(i, cpu_map) {
2130 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
2131 sd->span_weight = cpumask_weight(sched_domain_span(sd));
2132 if (sd->flags & SD_OVERLAP) {
2133 if (build_overlap_sched_groups(sd, i))
2134 goto error;
2135 } else {
2136 if (build_sched_groups(sd, i))
2137 goto error;
2138 }
2139 }
2140 }
2141
2142
2143 for (i = nr_cpumask_bits-1; i >= 0; i--) {
2144 if (!cpumask_test_cpu(i, cpu_map))
2145 continue;
2146
2147 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
2148 claim_allocations(i, sd);
2149 init_sched_groups_capacity(i, sd);
2150 }
2151 }
2152
2153
2154 rcu_read_lock();
2155 for_each_cpu(i, cpu_map) {
2156 rq = cpu_rq(i);
2157 sd = *per_cpu_ptr(d.sd, i);
2158
2159
2160 if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
2161 WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
2162
2163 cpu_attach_domain(sd, d.rd, i);
2164 }
2165 rcu_read_unlock();
2166
2167 if (has_asym)
2168 static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
2169
2170 if (rq && sched_debug_enabled) {
2171 pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
2172 cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
2173 }
2174
2175 ret = 0;
2176error:
2177 __free_domain_allocs(&d, alloc_state, cpu_map);
2178
2179 return ret;
2180}

/* Current sched domains: */
static cpumask_var_t *doms_cur;

/* Number of sched domains in 'doms_cur': */
static int ndoms_cur;

/* Attributes of custom domains in 'doms_cur': */
static struct sched_domain_attr *dattr_cur;

/*
 * Special case: If a kmalloc() of a doms_cur partition (array of
 * cpumask) fails, then fallback to a single sched domain,
 * as determined by the single cpumask fallback_doms.
 */
static cpumask_var_t fallback_doms;

/*
 * arch_update_cpu_topology lets virtualized architectures update the
 * CPU core maps. It is supposed to return 1 if the topology changed
 * or 0 if it stayed the same.
 */
int __weak arch_update_cpu_topology(void)
{
	return 0;
}

cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
{
	int i;
	cpumask_var_t *doms;

	doms = kmalloc_array(ndoms, sizeof(*doms), GFP_KERNEL);
	if (!doms)
		return NULL;
	for (i = 0; i < ndoms; i++) {
		if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
			free_sched_domains(doms, i);
			return NULL;
		}
	}
	return doms;
}

void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
{
	unsigned int i;

	for (i = 0; i < ndoms; i++)
		free_cpumask_var(doms[i]);
	kfree(doms);
}
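
/*
 * Set up scheduler domains and groups.  For now this just excludes isolated
 * CPUs, but could be used to exclude other special cases in the future.
 */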
int sched_init_domains(const struct cpumask *cpu_map)
{
	int err;

	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
	zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
	zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);

	arch_update_cpu_topology();
	ndoms_cur = 1;
	doms_cur = alloc_sched_domains(ndoms_cur);
	if (!doms_cur)
		doms_cur = &fallback_doms;
	cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
	err = build_sched_domains(doms_cur[0], NULL);
	register_sched_domain_sysctl();

	return err;
}
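
/*
 * Detach sched domains from a group of CPUs specified in cpu_map.
 * These CPUs will now be attached to the NULL domain.
 */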
static void detach_destroy_domains(const struct cpumask *cpu_map)
{
	unsigned int cpu = cpumask_any(cpu_map);
	int i;

	if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu)))
		static_branch_dec_cpuslocked(&sched_asym_cpucapacity);

	rcu_read_lock();
	for_each_cpu(i, cpu_map)
		cpu_attach_domain(NULL, &def_root_domain, i);
	rcu_read_unlock();
}

/* handle null as "default" */
static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
			struct sched_domain_attr *new, int idx_new)
{
	struct sched_domain_attr tmp;

	/* Fast path: */
	if (!new && !cur)
		return 1;

	tmp = SD_ATTR_INIT;

	return !memcmp(cur ? (cur + idx_cur) : &tmp,
			new ? (new + idx_new) : &tmp,
			sizeof(struct sched_domain_attr));
}
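
/*
 * Partition sched domains as specified by the 'ndoms_new' cpumasks in the
 * array doms_new[] of cpumasks. This compares doms_new[] to the current
 * sched domain partitioning, doms_cur[], destroys each deleted domain and
 * builds each new one.
 *
 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. The masks
 * don't intersect (don't overlap); we should set up one sched domain for
 * each mask. CPUs not in any of the cpumasks will not be load balanced. If
 * the same cpumask appears both in the current 'doms_cur' arrangement and
 * also in the new 'doms_new', we can leave it as it is.
 *
 * The passed in 'doms_new' should be allocated using alloc_sched_domains().
 * This routine takes ownership of it and will free_sched_domains() it when
 * done with it. If the caller failed the alloc call, then it can pass in
 * doms_new == NULL && ndoms_new == 1, and partition_sched_domains() will
 * fall back to the single partition 'fallback_doms' and force the domains
 * to be rebuilt.
 *
 * If doms_new == NULL it will be replaced with cpu_online_mask.
 * ndoms_new == 0 is a special case for destroying existing domains, and it
 * will not create the default domain.
 *
 * Call with hotplug lock and sched_domains_mutex held.
 */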
2319void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
2320 struct sched_domain_attr *dattr_new)
2321{
2322 bool __maybe_unused has_eas = false;
2323 int i, j, n;
2324 int new_topology;
2325
2326 lockdep_assert_held(&sched_domains_mutex);
2327
2328
2329 unregister_sched_domain_sysctl();
2330
2331
2332 new_topology = arch_update_cpu_topology();
2333
2334 if (!doms_new) {
2335 WARN_ON_ONCE(dattr_new);
2336 n = 0;
2337 doms_new = alloc_sched_domains(1);
2338 if (doms_new) {
2339 n = 1;
2340 cpumask_and(doms_new[0], cpu_active_mask,
2341 housekeeping_cpumask(HK_FLAG_DOMAIN));
2342 }
2343 } else {
2344 n = ndoms_new;
2345 }
2346
2347
2348 for (i = 0; i < ndoms_cur; i++) {
2349 for (j = 0; j < n && !new_topology; j++) {
2350 if (cpumask_equal(doms_cur[i], doms_new[j]) &&
2351 dattrs_equal(dattr_cur, i, dattr_new, j)) {
2352 struct root_domain *rd;
2353
2354
2355
2356
2357
2358
2359
2360 rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
2361 dl_clear_root_domain(rd);
2362 goto match1;
2363 }
2364 }
2365
2366 detach_destroy_domains(doms_cur[i]);
2367match1:
2368 ;
2369 }
2370
2371 n = ndoms_cur;
2372 if (!doms_new) {
2373 n = 0;
2374 doms_new = &fallback_doms;
2375 cpumask_and(doms_new[0], cpu_active_mask,
2376 housekeeping_cpumask(HK_FLAG_DOMAIN));
2377 }
2378
2379
2380 for (i = 0; i < ndoms_new; i++) {
2381 for (j = 0; j < n && !new_topology; j++) {
2382 if (cpumask_equal(doms_new[i], doms_cur[j]) &&
2383 dattrs_equal(dattr_new, i, dattr_cur, j))
2384 goto match2;
2385 }
2386
2387 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
2388match2:
2389 ;
2390 }
2391
2392#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
2393
2394 for (i = 0; i < ndoms_new; i++) {
2395 for (j = 0; j < n && !sched_energy_update; j++) {
2396 if (cpumask_equal(doms_new[i], doms_cur[j]) &&
2397 cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) {
2398 has_eas = true;
2399 goto match3;
2400 }
2401 }
2402
2403 has_eas |= build_perf_domains(doms_new[i]);
2404match3:
2405 ;
2406 }
2407 sched_energy_set(has_eas);
2408#endif
2409
2410
2411 if (doms_cur != &fallback_doms)
2412 free_sched_domains(doms_cur, ndoms_cur);
2413
2414 kfree(dattr_cur);
2415 doms_cur = doms_new;
2416 dattr_cur = dattr_new;
2417 ndoms_cur = ndoms_new;
2418
2419 register_sched_domain_sysctl();
2420}
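
/*
 * Call with hotplug lock held
 */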
void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
			     struct sched_domain_attr *dattr_new)
{
	mutex_lock(&sched_domains_mutex);
	partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
	mutex_unlock(&sched_domains_mutex);
}