// SPDX-License-Identifier: GPL-2.0
/*
 * Completely Fair Scheduling (CFS) class (SCHED_NORMAL/SCHED_BATCH).
 */
#include "sched.h"

/*
 * Targeted preemption latency for CPU-bound tasks:
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length
 * and have no persistent notion like in traditional, time-slice
 * based scheduling concepts.
 *
 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_latency			= 6000000ULL;
static unsigned int normalized_sysctl_sched_latency	= 6000000ULL;

/*
 * The initial- and re-scaling of tunables is configurable.
 *
 * Options are:
 *
 *   SCHED_TUNABLESCALING_NONE   - unscaled, always *1
 *   SCHED_TUNABLESCALING_LOG    - scaled logarithmically, *(1+ilog(ncpus))
 *   SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
 *
 * (default: SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
 */
unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;

/*
 * Minimal preemption granularity for CPU-bound tasks:
 *
 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_min_granularity			= 750000ULL;
static unsigned int normalized_sysctl_sched_min_granularity	= 750000ULL;

/*
 * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity.
 */
static unsigned int sched_nr_latency = 8;

/*
 * After fork, child runs first. If set to 0 (default) then
 * parent will (try to) run first.
 */
unsigned int sysctl_sched_child_runs_first __read_mostly;

/*
 * SCHED_OTHER wake-up granularity.
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 *
 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_wakeup_granularity			= 1000000UL;
static unsigned int normalized_sysctl_sched_wakeup_granularity	= 1000000UL;

const_debug unsigned int sysctl_sched_migration_cost	= 500000UL;

int sched_thermal_decay_shift;
static int __init setup_sched_thermal_decay_shift(char *str)
{
	int _shift = 0;

	if (kstrtoint(str, 0, &_shift))
		pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n");

	sched_thermal_decay_shift = clamp(_shift, 0, 10);
	return 1;
}
__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);

#ifdef CONFIG_SMP
/*
 * For asym packing, by default the lower numbered CPU has higher priority.
 */
int __weak arch_asym_cpu_priority(int cpu)
{
	return -cpu;
}

/*
 * The margin used when comparing utilization with CPU capacity.
 *
 * (default: ~20%)
 */
#define fits_capacity(cap, max)	((cap) * 1280 < (max) * 1024)

/*
 * The margin used when comparing CPU capacities:
 * is 'cap1' noticeably greater than 'cap2'?
 *
 * (default: ~5%)
 */
#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
#endif

#ifdef CONFIG_CFS_BANDWIDTH
/*
 * Amount of runtime to allocate from the global (tg) pool to the local
 * (per-cfs_rq) pool each time a cfs_rq requests quota.
 *
 * Note: if the slice exceeds the runtime remaining (either due to
 * consumption or the quota being smaller than the slice), only the
 * remaining available time is issued.
 *
 * (default: 5 msec, units: microseconds)
 */
unsigned int sysctl_sched_cfs_bandwidth_slice		= 5000UL;
#endif
138
139static inline void update_load_add(struct load_weight *lw, unsigned long inc)
140{
141 lw->weight += inc;
142 lw->inv_weight = 0;
143}
144
145static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
146{
147 lw->weight -= dec;
148 lw->inv_weight = 0;
149}
150
151static inline void update_load_set(struct load_weight *lw, unsigned long w)
152{
153 lw->weight = w;
154 lw->inv_weight = 0;
155}
156
157
158
159
160
161
162
163
164
165
166static unsigned int get_update_sysctl_factor(void)
167{
168 unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
169 unsigned int factor;
170
171 switch (sysctl_sched_tunable_scaling) {
172 case SCHED_TUNABLESCALING_NONE:
173 factor = 1;
174 break;
175 case SCHED_TUNABLESCALING_LINEAR:
176 factor = cpus;
177 break;
178 case SCHED_TUNABLESCALING_LOG:
179 default:
180 factor = 1 + ilog2(cpus);
181 break;
182 }
183
184 return factor;
185}
186
187static void update_sysctl(void)
188{
189 unsigned int factor = get_update_sysctl_factor();
190
191#define SET_SYSCTL(name) \
192 (sysctl_##name = (factor) * normalized_sysctl_##name)
193 SET_SYSCTL(sched_min_granularity);
194 SET_SYSCTL(sched_latency);
195 SET_SYSCTL(sched_wakeup_granularity);
196#undef SET_SYSCTL
197}
198
199void __init sched_init_granularity(void)
200{
201 update_sysctl();
202}
203
204#define WMULT_CONST (~0U)
205#define WMULT_SHIFT 32
206
207static void __update_inv_weight(struct load_weight *lw)
208{
209 unsigned long w;
210
211 if (likely(lw->inv_weight))
212 return;
213
214 w = scale_load_down(lw->weight);
215
216 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
217 lw->inv_weight = 1;
218 else if (unlikely(!w))
219 lw->inv_weight = WMULT_CONST;
220 else
221 lw->inv_weight = WMULT_CONST / w;
222}
223
224
225
226
227
228
229
230
231
232
233
234
235
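/*
 * delta_exec * weight / lw.weight
 *   OR
 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
 *
 * Since lw->inv_weight approximates 2^WMULT_SHIFT / lw->weight, the division
 * is replaced by a multiply and a shift. Whenever an intermediate product
 * would overflow 64 bits, the factor is pre-shifted down and the final shift
 * is reduced by the same amount.
 */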
236static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
237{
238 u64 fact = scale_load_down(weight);
239 u32 fact_hi = (u32)(fact >> 32);
240 int shift = WMULT_SHIFT;
241 int fs;
242
243 __update_inv_weight(lw);
244
245 if (unlikely(fact_hi)) {
246 fs = fls(fact_hi);
247 shift -= fs;
248 fact >>= fs;
249 }
250
251 fact = mul_u32_u32(fact, lw->inv_weight);
252
253 fact_hi = (u32)(fact >> 32);
254 if (fact_hi) {
255 fs = fls(fact_hi);
256 shift -= fs;
257 fact >>= fs;
258 }
259
260 return mul_u64_u32_shr(delta_exec, fact, shift);
261}
262
263
264const struct sched_class fair_sched_class;
265
266
267
268
269
270#ifdef CONFIG_FAIR_GROUP_SCHED
271
272
273#define for_each_sched_entity(se) \
274 for (; se; se = se->parent)
275
276static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
277{
278 if (!path)
279 return;
280
281 if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
282 autogroup_path(cfs_rq->tg, path, len);
283 else if (cfs_rq && cfs_rq->tg->css.cgroup)
284 cgroup_path(cfs_rq->tg->css.cgroup, path, len);
285 else
286 strlcpy(path, "(null)", len);
287}
288
289static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
290{
291 struct rq *rq = rq_of(cfs_rq);
292 int cpu = cpu_of(rq);
293
294 if (cfs_rq->on_list)
295 return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
296
297 cfs_rq->on_list = 1;
298
299
300
301
302
303
304
305
306
307
308 if (cfs_rq->tg->parent &&
309 cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
310
311
312
313
314
315
316 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
317 &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
318
319
320
321
322
323 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
324 return true;
325 }
326
327 if (!cfs_rq->tg->parent) {
328
329
330
331
332 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
333 &rq->leaf_cfs_rq_list);
334
335
336
337
338 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
339 return true;
340 }
341
342
343
344
345
346
347
348 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
349
350
351
352
353 rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
354 return false;
355}
356
357static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
358{
359 if (cfs_rq->on_list) {
360 struct rq *rq = rq_of(cfs_rq);
361
362
363
364
365
366
367
368
369 if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
370 rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
371
372 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
373 cfs_rq->on_list = 0;
374 }
375}
376
377static inline void assert_list_leaf_cfs_rq(struct rq *rq)
378{
379 SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
380}
381
382
383#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
384 list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
385 leaf_cfs_rq_list)
386
387
388static inline struct cfs_rq *
389is_same_group(struct sched_entity *se, struct sched_entity *pse)
390{
391 if (se->cfs_rq == pse->cfs_rq)
392 return se->cfs_rq;
393
394 return NULL;
395}
396
397static inline struct sched_entity *parent_entity(struct sched_entity *se)
398{
399 return se->parent;
400}
401
402static void
403find_matching_se(struct sched_entity **se, struct sched_entity **pse)
404{
405 int se_depth, pse_depth;
406
407
408
409
410
411
412
413
414
415 se_depth = (*se)->depth;
416 pse_depth = (*pse)->depth;
417
418 while (se_depth > pse_depth) {
419 se_depth--;
420 *se = parent_entity(*se);
421 }
422
423 while (pse_depth > se_depth) {
424 pse_depth--;
425 *pse = parent_entity(*pse);
426 }
427
428 while (!is_same_group(*se, *pse)) {
429 *se = parent_entity(*se);
430 *pse = parent_entity(*pse);
431 }
432}
433
434#else
435
436#define for_each_sched_entity(se) \
437 for (; se; se = NULL)
438
439static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
440{
441 if (path)
442 strlcpy(path, "(null)", len);
443}
444
445static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
446{
447 return true;
448}
449
450static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
451{
452}
453
454static inline void assert_list_leaf_cfs_rq(struct rq *rq)
455{
456}
457
458#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
459 for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
460
461static inline struct sched_entity *parent_entity(struct sched_entity *se)
462{
463 return NULL;
464}
465
466static inline void
467find_matching_se(struct sched_entity **se, struct sched_entity **pse)
468{
469}
470
471#endif
472
473static __always_inline
474void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
475
476
477
478
479
480static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
481{
482 s64 delta = (s64)(vruntime - max_vruntime);
483 if (delta > 0)
484 max_vruntime = vruntime;
485
486 return max_vruntime;
487}
488
489static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
490{
491 s64 delta = (s64)(vruntime - min_vruntime);
492 if (delta < 0)
493 min_vruntime = vruntime;
494
495 return min_vruntime;
496}
497
498static inline bool entity_before(struct sched_entity *a,
499 struct sched_entity *b)
500{
501 return (s64)(a->vruntime - b->vruntime) < 0;
502}
503
504#define __node_2_se(node) \
505 rb_entry((node), struct sched_entity, run_node)
506
507static void update_min_vruntime(struct cfs_rq *cfs_rq)
508{
509 struct sched_entity *curr = cfs_rq->curr;
510 struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
511
512 u64 vruntime = cfs_rq->min_vruntime;
513
514 if (curr) {
515 if (curr->on_rq)
516 vruntime = curr->vruntime;
517 else
518 curr = NULL;
519 }
520
521 if (leftmost) {
522 struct sched_entity *se = __node_2_se(leftmost);
523
524 if (!curr)
525 vruntime = se->vruntime;
526 else
527 vruntime = min_vruntime(vruntime, se->vruntime);
528 }
529
530
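	/* ensure we never gain time by being placed backwards. */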
531 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
532#ifndef CONFIG_64BIT
533 smp_wmb();
534 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
535#endif
536}
537
538static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
539{
540 return entity_before(__node_2_se(a), __node_2_se(b));
541}
542
543
544
545
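/*
 * Enqueue an entity into the rb-tree:
 */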
546static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
547{
548 rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
549}
550
551static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
552{
553 rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
554}
555
556struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
557{
558 struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
559
560 if (!left)
561 return NULL;
562
563 return __node_2_se(left);
564}
565
566static struct sched_entity *__pick_next_entity(struct sched_entity *se)
567{
568 struct rb_node *next = rb_next(&se->run_node);
569
570 if (!next)
571 return NULL;
572
573 return __node_2_se(next);
574}
575
576#ifdef CONFIG_SCHED_DEBUG
577struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
578{
579 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
580
581 if (!last)
582 return NULL;
583
584 return __node_2_se(last);
585}
586
587
588
589
590
591int sched_update_scaling(void)
592{
593 unsigned int factor = get_update_sysctl_factor();
594
595 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
596 sysctl_sched_min_granularity);
597
598#define WRT_SYSCTL(name) \
599 (normalized_sysctl_##name = sysctl_##name / (factor))
600 WRT_SYSCTL(sched_min_granularity);
601 WRT_SYSCTL(sched_latency);
602 WRT_SYSCTL(sched_wakeup_granularity);
603#undef WRT_SYSCTL
604
605 return 0;
606}
607#endif
608
609
610
611
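/*
 * delta /= w
 *
 * Scale a wall-clock delta by the entity's weight relative to NICE_0_LOAD.
 */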
612static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
613{
614 if (unlikely(se->load.weight != NICE_0_LOAD))
615 delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
616
617 return delta;
618}
619
620
621
622
623
624
625
626
627
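/*
 * The idea is to set a period in which each task runs once.
 *
 * When there are too many tasks (more than sched_nr_latency) we have to
 * stretch this period because otherwise the slices get too small.
 *
 * p = (nr <= nl) ? l : l*nr/nl
 */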
628static u64 __sched_period(unsigned long nr_running)
629{
630 if (unlikely(nr_running > sched_nr_latency))
631 return nr_running * sysctl_sched_min_granularity;
632 else
633 return sysctl_sched_latency;
634}
635
636
637
638
639
640
641
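/*
 * We calculate the wall-time slice from the period by taking a part
 * proportional to the weight:
 *
 * s = p*P[w/rw]
 */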
642static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
643{
644 unsigned int nr_running = cfs_rq->nr_running;
645 u64 slice;
646
647 if (sched_feat(ALT_PERIOD))
648 nr_running = rq_of(cfs_rq)->cfs.h_nr_running;
649
650 slice = __sched_period(nr_running + !se->on_rq);
651
652 for_each_sched_entity(se) {
653 struct load_weight *load;
654 struct load_weight lw;
655
656 cfs_rq = cfs_rq_of(se);
657 load = &cfs_rq->load;
658
659 if (unlikely(!se->on_rq)) {
660 lw = cfs_rq->load;
661
662 update_load_add(&lw, se->load.weight);
663 load = &lw;
664 }
665 slice = __calc_delta(slice, se->load.weight, load);
666 }
667
668 if (sched_feat(BASE_SLICE))
669 slice = max(slice, (u64)sysctl_sched_min_granularity);
670
671 return slice;
672}
673
674
675
676
677
678
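/*
 * We calculate the vruntime slice of a to-be-inserted task.
 *
 * vs = s/w
 */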
679static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
680{
681 return calc_delta_fair(sched_slice(cfs_rq, se), se);
682}
683
684#include "pelt.h"
685#ifdef CONFIG_SMP
686
687static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
688static unsigned long task_h_load(struct task_struct *p);
689static unsigned long capacity_of(int cpu);
690
691
692void init_entity_runnable_average(struct sched_entity *se)
693{
694 struct sched_avg *sa = &se->avg;
695
696 memset(sa, 0, sizeof(*sa));
697
698
699
700
701
702
703
704 if (entity_is_task(se))
705 sa->load_avg = scale_load_down(se->load.weight);
706
707
708}
709
710static void attach_entity_cfs_rq(struct sched_entity *se);
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738void post_init_entity_util_avg(struct task_struct *p)
739{
740 struct sched_entity *se = &p->se;
741 struct cfs_rq *cfs_rq = cfs_rq_of(se);
742 struct sched_avg *sa = &se->avg;
743 long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
744 long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
745
746 if (cap > 0) {
747 if (cfs_rq->avg.util_avg != 0) {
748 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
749 sa->util_avg /= (cfs_rq->avg.load_avg + 1);
750
751 if (sa->util_avg > cap)
752 sa->util_avg = cap;
753 } else {
754 sa->util_avg = cap;
755 }
756 }
757
758 sa->runnable_avg = sa->util_avg;
759
760 if (p->sched_class != &fair_sched_class) {
761
762
763
764
765
766
767
768
769
770
771 se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
772 return;
773 }
774
775 attach_entity_cfs_rq(se);
776}
777
778#else
779void init_entity_runnable_average(struct sched_entity *se)
780{
781}
782void post_init_entity_util_avg(struct task_struct *p)
783{
784}
785static void update_tg_load_avg(struct cfs_rq *cfs_rq)
786{
787}
788#endif
789
790
791
792
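/*
 * Update the current task's runtime statistics.
 */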
793static void update_curr(struct cfs_rq *cfs_rq)
794{
795 struct sched_entity *curr = cfs_rq->curr;
796 u64 now = rq_clock_task(rq_of(cfs_rq));
797 u64 delta_exec;
798
799 if (unlikely(!curr))
800 return;
801
802 delta_exec = now - curr->exec_start;
803 if (unlikely((s64)delta_exec <= 0))
804 return;
805
806 curr->exec_start = now;
807
808 schedstat_set(curr->statistics.exec_max,
809 max(delta_exec, curr->statistics.exec_max));
810
811 curr->sum_exec_runtime += delta_exec;
812 schedstat_add(cfs_rq->exec_clock, delta_exec);
813
814 curr->vruntime += calc_delta_fair(delta_exec, curr);
815 update_min_vruntime(cfs_rq);
816
817 if (entity_is_task(curr)) {
818 struct task_struct *curtask = task_of(curr);
819
820 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
821 cgroup_account_cputime(curtask, delta_exec);
822 account_group_exec_runtime(curtask, delta_exec);
823 }
824
825 account_cfs_rq_runtime(cfs_rq, delta_exec);
826}
827
828static void update_curr_fair(struct rq *rq)
829{
830 update_curr(cfs_rq_of(&rq->curr->se));
831}
832
833static inline void
834update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
835{
836 u64 wait_start, prev_wait_start;
837
838 if (!schedstat_enabled())
839 return;
840
841 wait_start = rq_clock(rq_of(cfs_rq));
842 prev_wait_start = schedstat_val(se->statistics.wait_start);
843
844 if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
845 likely(wait_start > prev_wait_start))
846 wait_start -= prev_wait_start;
847
848 __schedstat_set(se->statistics.wait_start, wait_start);
849}
850
851static inline void
852update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
853{
854 struct task_struct *p;
855 u64 delta;
856
857 if (!schedstat_enabled())
858 return;
859
860
861
862
863
864
865
866 if (unlikely(!schedstat_val(se->statistics.wait_start)))
867 return;
868
869 delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
870
871 if (entity_is_task(se)) {
872 p = task_of(se);
873 if (task_on_rq_migrating(p)) {
874
875
876
877
878
879 __schedstat_set(se->statistics.wait_start, delta);
880 return;
881 }
882 trace_sched_stat_wait(p, delta);
883 }
884
885 __schedstat_set(se->statistics.wait_max,
886 max(schedstat_val(se->statistics.wait_max), delta));
887 __schedstat_inc(se->statistics.wait_count);
888 __schedstat_add(se->statistics.wait_sum, delta);
889 __schedstat_set(se->statistics.wait_start, 0);
890}
891
892static inline void
893update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
894{
895 struct task_struct *tsk = NULL;
896 u64 sleep_start, block_start;
897
898 if (!schedstat_enabled())
899 return;
900
901 sleep_start = schedstat_val(se->statistics.sleep_start);
902 block_start = schedstat_val(se->statistics.block_start);
903
904 if (entity_is_task(se))
905 tsk = task_of(se);
906
907 if (sleep_start) {
908 u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
909
910 if ((s64)delta < 0)
911 delta = 0;
912
913 if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
914 __schedstat_set(se->statistics.sleep_max, delta);
915
916 __schedstat_set(se->statistics.sleep_start, 0);
917 __schedstat_add(se->statistics.sum_sleep_runtime, delta);
918
919 if (tsk) {
920 account_scheduler_latency(tsk, delta >> 10, 1);
921 trace_sched_stat_sleep(tsk, delta);
922 }
923 }
924 if (block_start) {
925 u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
926
927 if ((s64)delta < 0)
928 delta = 0;
929
930 if (unlikely(delta > schedstat_val(se->statistics.block_max)))
931 __schedstat_set(se->statistics.block_max, delta);
932
933 __schedstat_set(se->statistics.block_start, 0);
934 __schedstat_add(se->statistics.sum_sleep_runtime, delta);
935
936 if (tsk) {
937 if (tsk->in_iowait) {
938 __schedstat_add(se->statistics.iowait_sum, delta);
939 __schedstat_inc(se->statistics.iowait_count);
940 trace_sched_stat_iowait(tsk, delta);
941 }
942
943 trace_sched_stat_blocked(tsk, delta);
944
945
946
947
948
949
950 if (unlikely(prof_on == SLEEP_PROFILING)) {
951 profile_hits(SLEEP_PROFILING,
952 (void *)get_wchan(tsk),
953 delta >> 20);
954 }
955 account_scheduler_latency(tsk, delta >> 10, 0);
956 }
957 }
958}
959
960
961
962
963static inline void
964update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
965{
966 if (!schedstat_enabled())
967 return;
968
969
970
971
972
973 if (se != cfs_rq->curr)
974 update_stats_wait_start(cfs_rq, se);
975
976 if (flags & ENQUEUE_WAKEUP)
977 update_stats_enqueue_sleeper(cfs_rq, se);
978}
979
980static inline void
981update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
982{
983
984 if (!schedstat_enabled())
985 return;
986
987
988
989
990
991 if (se != cfs_rq->curr)
992 update_stats_wait_end(cfs_rq, se);
993
994 if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
995 struct task_struct *tsk = task_of(se);
996 unsigned int state;
997
998
999 state = READ_ONCE(tsk->__state);
1000 if (state & TASK_INTERRUPTIBLE)
1001 __schedstat_set(se->statistics.sleep_start,
1002 rq_clock(rq_of(cfs_rq)));
1003 if (state & TASK_UNINTERRUPTIBLE)
1004 __schedstat_set(se->statistics.block_start,
1005 rq_clock(rq_of(cfs_rq)));
1006 }
1007}
1008
1009
1010
1011
1012static inline void
1013update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
1014{
1015
1016
1017
1018 se->exec_start = rq_clock_task(rq_of(cfs_rq));
1019}
1020
1021
1022
1023
1024
1025#ifdef CONFIG_NUMA_BALANCING
1026
1027
1028
1029
1030
1031unsigned int sysctl_numa_balancing_scan_period_min = 1000;
1032unsigned int sysctl_numa_balancing_scan_period_max = 60000;
1033
1034
1035unsigned int sysctl_numa_balancing_scan_size = 256;
1036
1037
1038unsigned int sysctl_numa_balancing_scan_delay = 1000;
1039
1040struct numa_group {
1041 refcount_t refcount;
1042
1043 spinlock_t lock;
1044 int nr_tasks;
1045 pid_t gid;
1046 int active_nodes;
1047
1048 struct rcu_head rcu;
1049 unsigned long total_faults;
1050 unsigned long max_faults_cpu;
1051
1052
1053
1054
1055
1056 unsigned long *faults_cpu;
1057 unsigned long faults[];
1058};
1059
1060
1061
1062
1063
1064static struct numa_group *deref_task_numa_group(struct task_struct *p)
1065{
1066 return rcu_dereference_check(p->numa_group, p == current ||
1067 (lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu)));
1068}
1069
1070static struct numa_group *deref_curr_numa_group(struct task_struct *p)
1071{
1072 return rcu_dereference_protected(p->numa_group, p == current);
1073}
1074
1075static inline unsigned long group_faults_priv(struct numa_group *ng);
1076static inline unsigned long group_faults_shared(struct numa_group *ng);
1077
1078static unsigned int task_nr_scan_windows(struct task_struct *p)
1079{
1080 unsigned long rss = 0;
1081 unsigned long nr_scan_pages;
1082
1083
1084
1085
1086
1087
1088 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
1089 rss = get_mm_rss(p->mm);
1090 if (!rss)
1091 rss = nr_scan_pages;
1092
1093 rss = round_up(rss, nr_scan_pages);
1094 return rss / nr_scan_pages;
1095}
1096
1097
1098#define MAX_SCAN_WINDOW 2560
1099
1100static unsigned int task_scan_min(struct task_struct *p)
1101{
1102 unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
1103 unsigned int scan, floor;
1104 unsigned int windows = 1;
1105
1106 if (scan_size < MAX_SCAN_WINDOW)
1107 windows = MAX_SCAN_WINDOW / scan_size;
1108 floor = 1000 / windows;
1109
1110 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
1111 return max_t(unsigned int, floor, scan);
1112}
1113
1114static unsigned int task_scan_start(struct task_struct *p)
1115{
1116 unsigned long smin = task_scan_min(p);
1117 unsigned long period = smin;
1118 struct numa_group *ng;
1119
1120
1121 rcu_read_lock();
1122 ng = rcu_dereference(p->numa_group);
1123 if (ng) {
1124 unsigned long shared = group_faults_shared(ng);
1125 unsigned long private = group_faults_priv(ng);
1126
1127 period *= refcount_read(&ng->refcount);
1128 period *= shared + 1;
1129 period /= private + shared + 1;
1130 }
1131 rcu_read_unlock();
1132
1133 return max(smin, period);
1134}
1135
1136static unsigned int task_scan_max(struct task_struct *p)
1137{
1138 unsigned long smin = task_scan_min(p);
1139 unsigned long smax;
1140 struct numa_group *ng;
1141
1142
1143 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
1144
1145
1146 ng = deref_curr_numa_group(p);
1147 if (ng) {
1148 unsigned long shared = group_faults_shared(ng);
1149 unsigned long private = group_faults_priv(ng);
1150 unsigned long period = smax;
1151
1152 period *= refcount_read(&ng->refcount);
1153 period *= shared + 1;
1154 period /= private + shared + 1;
1155
1156 smax = max(smax, period);
1157 }
1158
1159 return max(smin, smax);
1160}
1161
1162static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1163{
1164 rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
1165 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1166}
1167
1168static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1169{
1170 rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
1171 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1172}
1173
1174
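/* Shared or private faults. */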
1175#define NR_NUMA_HINT_FAULT_TYPES 2
1176
1177
1178#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
1179
1180
1181#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1182
1183pid_t task_numa_group_id(struct task_struct *p)
1184{
1185 struct numa_group *ng;
1186 pid_t gid = 0;
1187
1188 rcu_read_lock();
1189 ng = rcu_dereference(p->numa_group);
1190 if (ng)
1191 gid = ng->gid;
1192 rcu_read_unlock();
1193
1194 return gid;
1195}
1196
1197
1198
1199
1200
1201
1202
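/*
 * The averaged statistics, shared & private, memory & CPU,
 * occupy the first half of the faults array. The second half
 * holds the current counting period's values.
 */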
1203static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
1204{
1205 return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
1206}
1207
1208static inline unsigned long task_faults(struct task_struct *p, int nid)
1209{
1210 if (!p->numa_faults)
1211 return 0;
1212
1213 return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1214 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
1215}
1216
1217static inline unsigned long group_faults(struct task_struct *p, int nid)
1218{
1219 struct numa_group *ng = deref_task_numa_group(p);
1220
1221 if (!ng)
1222 return 0;
1223
1224 return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1225 ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
1226}
1227
1228static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1229{
1230 return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
1231 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
1232}
1233
1234static inline unsigned long group_faults_priv(struct numa_group *ng)
1235{
1236 unsigned long faults = 0;
1237 int node;
1238
1239 for_each_online_node(node) {
1240 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
1241 }
1242
1243 return faults;
1244}
1245
1246static inline unsigned long group_faults_shared(struct numa_group *ng)
1247{
1248 unsigned long faults = 0;
1249 int node;
1250
1251 for_each_online_node(node) {
1252 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
1253 }
1254
1255 return faults;
1256}
1257
1258
1259
1260
1261
1262
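/*
 * A node triggering more than 1/3 as many NUMA faults as the
 * maximum is considered part of this numa group's active set.
 */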
1263#define ACTIVE_NODE_FRACTION 3
1264
1265static bool numa_is_active_node(int nid, struct numa_group *ng)
1266{
1267 return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
1268}
1269
1270
1271static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1272 int maxdist, bool task)
1273{
1274 unsigned long score = 0;
1275 int node;
1276
1277
1278
1279
1280
1281 if (sched_numa_topology_type == NUMA_DIRECT)
1282 return 0;
1283
1284
1285
1286
1287
1288 for_each_online_node(node) {
1289 unsigned long faults;
1290 int dist = node_distance(nid, node);
1291
1292
1293
1294
1295
1296 if (dist == sched_max_numa_distance || node == nid)
1297 continue;
1298
1299
1300
1301
1302
1303
1304
1305
1306 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1307 dist >= maxdist)
1308 continue;
1309
1310
1311 if (task)
1312 faults = task_faults(p, node);
1313 else
1314 faults = group_faults(p, node);
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1325 faults *= (sched_max_numa_distance - dist);
1326 faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1327 }
1328
1329 score += faults;
1330 }
1331
1332 return score;
1333}
1334
1335
1336
1337
1338
1339
1340
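/*
 * These return the fraction of accesses done by a particular task, or
 * task group, on a particular numa node. The group weight is given a
 * larger multiplier, in order to group tasks together that are almost
 * evenly spread out between numa nodes.
 */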
1341static inline unsigned long task_weight(struct task_struct *p, int nid,
1342 int dist)
1343{
1344 unsigned long faults, total_faults;
1345
1346 if (!p->numa_faults)
1347 return 0;
1348
1349 total_faults = p->total_numa_faults;
1350
1351 if (!total_faults)
1352 return 0;
1353
1354 faults = task_faults(p, nid);
1355 faults += score_nearby_nodes(p, nid, dist, true);
1356
1357 return 1000 * faults / total_faults;
1358}
1359
1360static inline unsigned long group_weight(struct task_struct *p, int nid,
1361 int dist)
1362{
1363 struct numa_group *ng = deref_task_numa_group(p);
1364 unsigned long faults, total_faults;
1365
1366 if (!ng)
1367 return 0;
1368
1369 total_faults = ng->total_faults;
1370
1371 if (!total_faults)
1372 return 0;
1373
1374 faults = group_faults(p, nid);
1375 faults += score_nearby_nodes(p, nid, dist, false);
1376
1377 return 1000 * faults / total_faults;
1378}
1379
1380bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1381 int src_nid, int dst_cpu)
1382{
1383 struct numa_group *ng = deref_curr_numa_group(p);
1384 int dst_nid = cpu_to_node(dst_cpu);
1385 int last_cpupid, this_cpupid;
1386
1387 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1388 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1389
1390
1391
1392
1393
1394
1395
1396 if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
1397 (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
1398 return true;
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417 if (!cpupid_pid_unset(last_cpupid) &&
1418 cpupid_to_nid(last_cpupid) != dst_nid)
1419 return false;
1420
1421
1422 if (cpupid_match_pid(p, last_cpupid))
1423 return true;
1424
1425
1426 if (!ng)
1427 return true;
1428
1429
1430
1431
1432
1433 if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
1434 ACTIVE_NODE_FRACTION)
1435 return true;
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445 return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
1446 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
1447}
1448
1449
1450
1451
1452enum numa_type {
1453
1454 node_has_spare = 0,
1455
1456
1457
1458
1459 node_fully_busy,
1460
1461
1462
1463
1464 node_overloaded
1465};
1466
1467
1468struct numa_stats {
1469 unsigned long load;
1470 unsigned long runnable;
1471 unsigned long util;
1472
1473 unsigned long compute_capacity;
1474 unsigned int nr_running;
1475 unsigned int weight;
1476 enum numa_type node_type;
1477 int idle_cpu;
1478};
1479
/* A core is idle only if all SMT siblings of @cpu are idle. */
static inline bool is_core_idle(int cpu)
{
#ifdef CONFIG_SCHED_SMT
	int sibling;

	for_each_cpu(sibling, cpu_smt_mask(cpu)) {
		if (cpu == sibling)
			continue;

		/* Check the sibling, not the CPU we started from. */
		if (!idle_cpu(sibling))
			return false;
	}
#endif

	return true;
}
1496
1497struct task_numa_env {
1498 struct task_struct *p;
1499
1500 int src_cpu, src_nid;
1501 int dst_cpu, dst_nid;
1502
1503 struct numa_stats src_stats, dst_stats;
1504
1505 int imbalance_pct;
1506 int dist;
1507
1508 struct task_struct *best_task;
1509 long best_imp;
1510 int best_cpu;
1511};
1512
1513static unsigned long cpu_load(struct rq *rq);
1514static unsigned long cpu_runnable(struct rq *rq);
1515static unsigned long cpu_util(int cpu);
1516static inline long adjust_numa_imbalance(int imbalance,
1517 int dst_running, int dst_weight);
1518
1519static inline enum
1520numa_type numa_classify(unsigned int imbalance_pct,
1521 struct numa_stats *ns)
1522{
1523 if ((ns->nr_running > ns->weight) &&
1524 (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
1525 ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
1526 return node_overloaded;
1527
1528 if ((ns->nr_running < ns->weight) ||
1529 (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
1530 ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
1531 return node_has_spare;
1532
1533 return node_fully_busy;
1534}
1535
1536#ifdef CONFIG_SCHED_SMT
1537
1538static inline bool test_idle_cores(int cpu, bool def);
1539static inline int numa_idle_core(int idle_core, int cpu)
1540{
1541 if (!static_branch_likely(&sched_smt_present) ||
1542 idle_core >= 0 || !test_idle_cores(cpu, false))
1543 return idle_core;
1544
1545
1546
1547
1548
1549 if (is_core_idle(cpu))
1550 idle_core = cpu;
1551
1552 return idle_core;
1553}
1554#else
1555static inline int numa_idle_core(int idle_core, int cpu)
1556{
1557 return idle_core;
1558}
1559#endif
1560
1561
1562
1563
1564
1565
1566
1567static void update_numa_stats(struct task_numa_env *env,
1568 struct numa_stats *ns, int nid,
1569 bool find_idle)
1570{
1571 int cpu, idle_core = -1;
1572
1573 memset(ns, 0, sizeof(*ns));
1574 ns->idle_cpu = -1;
1575
1576 rcu_read_lock();
1577 for_each_cpu(cpu, cpumask_of_node(nid)) {
1578 struct rq *rq = cpu_rq(cpu);
1579
1580 ns->load += cpu_load(rq);
1581 ns->runnable += cpu_runnable(rq);
1582 ns->util += cpu_util(cpu);
1583 ns->nr_running += rq->cfs.h_nr_running;
1584 ns->compute_capacity += capacity_of(cpu);
1585
1586 if (find_idle && !rq->nr_running && idle_cpu(cpu)) {
1587 if (READ_ONCE(rq->numa_migrate_on) ||
1588 !cpumask_test_cpu(cpu, env->p->cpus_ptr))
1589 continue;
1590
1591 if (ns->idle_cpu == -1)
1592 ns->idle_cpu = cpu;
1593
1594 idle_core = numa_idle_core(idle_core, cpu);
1595 }
1596 }
1597 rcu_read_unlock();
1598
1599 ns->weight = cpumask_weight(cpumask_of_node(nid));
1600
1601 ns->node_type = numa_classify(env->imbalance_pct, ns);
1602
1603 if (idle_core >= 0)
1604 ns->idle_cpu = idle_core;
1605}
1606
1607static void task_numa_assign(struct task_numa_env *env,
1608 struct task_struct *p, long imp)
1609{
1610 struct rq *rq = cpu_rq(env->dst_cpu);
1611
1612
1613 if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) {
1614 int cpu;
1615 int start = env->dst_cpu;
1616
1617
1618 for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) {
1619 if (cpu == env->best_cpu || !idle_cpu(cpu) ||
1620 !cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
1621 continue;
1622 }
1623
1624 env->dst_cpu = cpu;
1625 rq = cpu_rq(env->dst_cpu);
1626 if (!xchg(&rq->numa_migrate_on, 1))
1627 goto assign;
1628 }
1629
1630
1631 return;
1632 }
1633
1634assign:
1635
1636
1637
1638
1639 if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) {
1640 rq = cpu_rq(env->best_cpu);
1641 WRITE_ONCE(rq->numa_migrate_on, 0);
1642 }
1643
1644 if (env->best_task)
1645 put_task_struct(env->best_task);
1646 if (p)
1647 get_task_struct(p);
1648
1649 env->best_task = p;
1650 env->best_imp = imp;
1651 env->best_cpu = env->dst_cpu;
1652}
1653
1654static bool load_too_imbalanced(long src_load, long dst_load,
1655 struct task_numa_env *env)
1656{
1657 long imb, old_imb;
1658 long orig_src_load, orig_dst_load;
1659 long src_capacity, dst_capacity;
1660
1661
1662
1663
1664
1665
1666
1667
1668 src_capacity = env->src_stats.compute_capacity;
1669 dst_capacity = env->dst_stats.compute_capacity;
1670
1671 imb = abs(dst_load * src_capacity - src_load * dst_capacity);
1672
1673 orig_src_load = env->src_stats.load;
1674 orig_dst_load = env->dst_stats.load;
1675
1676 old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
1677
1678
1679 return (imb > old_imb);
1680}
1681
1682
1683
1684
1685
1686
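/*
 * Maximum NUMA importance can be 1998 (2*999);
 * SMALLIMP @ 30 would be 3% of the maximum importance.
 *
 * Used to deter task migration when the improvement is only marginal.
 */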
1687#define SMALLIMP 30
1688
1689
1690
1691
1692
1693
1694
1695static bool task_numa_compare(struct task_numa_env *env,
1696 long taskimp, long groupimp, bool maymove)
1697{
1698 struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
1699 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1700 long imp = p_ng ? groupimp : taskimp;
1701 struct task_struct *cur;
1702 long src_load, dst_load;
1703 int dist = env->dist;
1704 long moveimp = imp;
1705 long load;
1706 bool stopsearch = false;
1707
1708 if (READ_ONCE(dst_rq->numa_migrate_on))
1709 return false;
1710
1711 rcu_read_lock();
1712 cur = rcu_dereference(dst_rq->curr);
1713 if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
1714 cur = NULL;
1715
1716
1717
1718
1719
1720 if (cur == env->p) {
1721 stopsearch = true;
1722 goto unlock;
1723 }
1724
1725 if (!cur) {
1726 if (maymove && moveimp >= env->best_imp)
1727 goto assign;
1728 else
1729 goto unlock;
1730 }
1731
1732
1733 if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
1734 goto unlock;
1735
1736
1737
1738
1739
1740 if (env->best_task &&
1741 env->best_task->numa_preferred_nid == env->src_nid &&
1742 cur->numa_preferred_nid != env->src_nid) {
1743 goto unlock;
1744 }
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756 cur_ng = rcu_dereference(cur->numa_group);
1757 if (cur_ng == p_ng) {
1758 imp = taskimp + task_weight(cur, env->src_nid, dist) -
1759 task_weight(cur, env->dst_nid, dist);
1760
1761
1762
1763
1764 if (cur_ng)
1765 imp -= imp / 16;
1766 } else {
1767
1768
1769
1770
1771 if (cur_ng && p_ng)
1772 imp += group_weight(cur, env->src_nid, dist) -
1773 group_weight(cur, env->dst_nid, dist);
1774 else
1775 imp += task_weight(cur, env->src_nid, dist) -
1776 task_weight(cur, env->dst_nid, dist);
1777 }
1778
1779
1780 if (cur->numa_preferred_nid == env->dst_nid)
1781 imp -= imp / 16;
1782
1783
1784
1785
1786
1787
1788
1789 if (cur->numa_preferred_nid == env->src_nid)
1790 imp += imp / 8;
1791
1792 if (maymove && moveimp > imp && moveimp > env->best_imp) {
1793 imp = moveimp;
1794 cur = NULL;
1795 goto assign;
1796 }
1797
1798
1799
1800
1801
1802 if (env->best_task && cur->numa_preferred_nid == env->src_nid &&
1803 env->best_task->numa_preferred_nid != env->src_nid) {
1804 goto assign;
1805 }
1806
1807
1808
1809
1810
1811
1812
1813 if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
1814 goto unlock;
1815
1816
1817
1818
1819 load = task_h_load(env->p) - task_h_load(cur);
1820 if (!load)
1821 goto assign;
1822
1823 dst_load = env->dst_stats.load + load;
1824 src_load = env->src_stats.load - load;
1825
1826 if (load_too_imbalanced(src_load, dst_load, env))
1827 goto unlock;
1828
1829assign:
1830
1831 if (!cur) {
1832 int cpu = env->dst_stats.idle_cpu;
1833
1834
1835 if (cpu < 0)
1836 cpu = env->dst_cpu;
1837
1838
1839
1840
1841
1842 if (!idle_cpu(cpu) && env->best_cpu >= 0 &&
1843 idle_cpu(env->best_cpu)) {
1844 cpu = env->best_cpu;
1845 }
1846
1847 env->dst_cpu = cpu;
1848 }
1849
1850 task_numa_assign(env, cur, imp);
1851
1852
1853
1854
1855
1856
1857 if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu))
1858 stopsearch = true;
1859
1860
1861
1862
1863
1864 if (!maymove && env->best_task &&
1865 env->best_task->numa_preferred_nid == env->src_nid) {
1866 stopsearch = true;
1867 }
1868unlock:
1869 rcu_read_unlock();
1870
1871 return stopsearch;
1872}
1873
1874static void task_numa_find_cpu(struct task_numa_env *env,
1875 long taskimp, long groupimp)
1876{
1877 bool maymove = false;
1878 int cpu;
1879
1880
1881
1882
1883
1884 if (env->dst_stats.node_type == node_has_spare) {
1885 unsigned int imbalance;
1886 int src_running, dst_running;
1887
1888
1889
1890
1891
1892
1893
1894 src_running = env->src_stats.nr_running - 1;
1895 dst_running = env->dst_stats.nr_running + 1;
1896 imbalance = max(0, dst_running - src_running);
1897 imbalance = adjust_numa_imbalance(imbalance, dst_running,
1898 env->dst_stats.weight);
1899
1900
1901 if (!imbalance) {
1902 maymove = true;
1903 if (env->dst_stats.idle_cpu >= 0) {
1904 env->dst_cpu = env->dst_stats.idle_cpu;
1905 task_numa_assign(env, NULL, 0);
1906 return;
1907 }
1908 }
1909 } else {
1910 long src_load, dst_load, load;
1911
1912
1913
1914
1915 load = task_h_load(env->p);
1916 dst_load = env->dst_stats.load + load;
1917 src_load = env->src_stats.load - load;
1918 maymove = !load_too_imbalanced(src_load, dst_load, env);
1919 }
1920
1921 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1922
1923 if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
1924 continue;
1925
1926 env->dst_cpu = cpu;
1927 if (task_numa_compare(env, taskimp, groupimp, maymove))
1928 break;
1929 }
1930}
1931
1932static int task_numa_migrate(struct task_struct *p)
1933{
1934 struct task_numa_env env = {
1935 .p = p,
1936
1937 .src_cpu = task_cpu(p),
1938 .src_nid = task_node(p),
1939
1940 .imbalance_pct = 112,
1941
1942 .best_task = NULL,
1943 .best_imp = 0,
1944 .best_cpu = -1,
1945 };
1946 unsigned long taskweight, groupweight;
1947 struct sched_domain *sd;
1948 long taskimp, groupimp;
1949 struct numa_group *ng;
1950 struct rq *best_rq;
1951 int nid, ret, dist;
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961 rcu_read_lock();
1962 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1963 if (sd)
1964 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1965 rcu_read_unlock();
1966
1967
1968
1969
1970
1971
1972
1973 if (unlikely(!sd)) {
1974 sched_setnuma(p, task_node(p));
1975 return -EINVAL;
1976 }
1977
1978 env.dst_nid = p->numa_preferred_nid;
1979 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1980 taskweight = task_weight(p, env.src_nid, dist);
1981 groupweight = group_weight(p, env.src_nid, dist);
1982 update_numa_stats(&env, &env.src_stats, env.src_nid, false);
1983 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1984 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1985 update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
1986
1987
1988 task_numa_find_cpu(&env, taskimp, groupimp);
1989
1990
1991
1992
1993
1994
1995
1996
1997 ng = deref_curr_numa_group(p);
1998 if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
1999 for_each_online_node(nid) {
2000 if (nid == env.src_nid || nid == p->numa_preferred_nid)
2001 continue;
2002
2003 dist = node_distance(env.src_nid, env.dst_nid);
2004 if (sched_numa_topology_type == NUMA_BACKPLANE &&
2005 dist != env.dist) {
2006 taskweight = task_weight(p, env.src_nid, dist);
2007 groupweight = group_weight(p, env.src_nid, dist);
2008 }
2009
2010
2011 taskimp = task_weight(p, nid, dist) - taskweight;
2012 groupimp = group_weight(p, nid, dist) - groupweight;
2013 if (taskimp < 0 && groupimp < 0)
2014 continue;
2015
2016 env.dist = dist;
2017 env.dst_nid = nid;
2018 update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
2019 task_numa_find_cpu(&env, taskimp, groupimp);
2020 }
2021 }
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031 if (ng) {
2032 if (env.best_cpu == -1)
2033 nid = env.src_nid;
2034 else
2035 nid = cpu_to_node(env.best_cpu);
2036
2037 if (nid != p->numa_preferred_nid)
2038 sched_setnuma(p, nid);
2039 }
2040
2041
2042 if (env.best_cpu == -1) {
2043 trace_sched_stick_numa(p, env.src_cpu, NULL, -1);
2044 return -EAGAIN;
2045 }
2046
2047 best_rq = cpu_rq(env.best_cpu);
2048 if (env.best_task == NULL) {
2049 ret = migrate_task_to(p, env.best_cpu);
2050 WRITE_ONCE(best_rq->numa_migrate_on, 0);
2051 if (ret != 0)
2052 trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu);
2053 return ret;
2054 }
2055
2056 ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
2057 WRITE_ONCE(best_rq->numa_migrate_on, 0);
2058
2059 if (ret != 0)
2060 trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu);
2061 put_task_struct(env.best_task);
2062 return ret;
2063}
2064
2065
2066static void numa_migrate_preferred(struct task_struct *p)
2067{
2068 unsigned long interval = HZ;
2069
2070
2071 if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
2072 return;
2073
2074
2075 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
2076 p->numa_migrate_retry = jiffies + interval;
2077
2078
2079 if (task_node(p) == p->numa_preferred_nid)
2080 return;
2081
2082
2083 task_numa_migrate(p);
2084}
2085
2086
2087
2088
2089
2090
2091
2092static void numa_group_count_active_nodes(struct numa_group *numa_group)
2093{
2094 unsigned long faults, max_faults = 0;
2095 int nid, active_nodes = 0;
2096
2097 for_each_online_node(nid) {
2098 faults = group_faults_cpu(numa_group, nid);
2099 if (faults > max_faults)
2100 max_faults = faults;
2101 }
2102
2103 for_each_online_node(nid) {
2104 faults = group_faults_cpu(numa_group, nid);
2105 if (faults * ACTIVE_NODE_FRACTION > max_faults)
2106 active_nodes++;
2107 }
2108
2109 numa_group->max_faults_cpu = max_faults;
2110 numa_group->active_nodes = active_nodes;
2111}
2112
2113
2114
2115
2116
2117
2118
2119
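/*
 * When adapting the scan rate, the period is divided into slots, and
 * the scanning rate is increased or decreased by the ratio of slots
 * where the faults happened locally (or privately) vs. where they
 * did not.
 */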
2120#define NUMA_PERIOD_SLOTS 10
2121#define NUMA_PERIOD_THRESHOLD 7
2122
2123
2124
2125
2126
2127
2128
2129static void update_task_scan_period(struct task_struct *p,
2130 unsigned long shared, unsigned long private)
2131{
2132 unsigned int period_slot;
2133 int lr_ratio, ps_ratio;
2134 int diff;
2135
2136 unsigned long remote = p->numa_faults_locality[0];
2137 unsigned long local = p->numa_faults_locality[1];
2138
2139
2140
2141
2142
2143
2144
2145
2146 if (local + shared == 0 || p->numa_faults_locality[2]) {
2147 p->numa_scan_period = min(p->numa_scan_period_max,
2148 p->numa_scan_period << 1);
2149
2150 p->mm->numa_next_scan = jiffies +
2151 msecs_to_jiffies(p->numa_scan_period);
2152
2153 return;
2154 }
2155
2156
2157
2158
2159
2160
2161
2162 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
2163 lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
2164 ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
2165
2166 if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
2167
2168
2169
2170
2171 int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
2172 if (!slot)
2173 slot = 1;
2174 diff = slot * period_slot;
2175 } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
2176
2177
2178
2179
2180
2181 int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
2182 if (!slot)
2183 slot = 1;
2184 diff = slot * period_slot;
2185 } else {
2186
2187
2188
2189
2190
2191 int ratio = max(lr_ratio, ps_ratio);
2192 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
2193 }
2194
2195 p->numa_scan_period = clamp(p->numa_scan_period + diff,
2196 task_scan_min(p), task_scan_max(p));
2197 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2198}
2199
2200
2201
2202
2203
2204
2205
2206
2207static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
2208{
2209 u64 runtime, delta, now;
2210
2211 now = p->se.exec_start;
2212 runtime = p->se.sum_exec_runtime;
2213
2214 if (p->last_task_numa_placement) {
2215 delta = runtime - p->last_sum_exec_runtime;
2216 *period = now - p->last_task_numa_placement;
2217
2218
2219 if (unlikely((s64)*period < 0))
2220 *period = 0;
2221 } else {
2222 delta = p->se.avg.load_sum;
2223 *period = LOAD_AVG_MAX;
2224 }
2225
2226 p->last_sum_exec_runtime = runtime;
2227 p->last_task_numa_placement = now;
2228
2229 return delta;
2230}
2231
2232
2233
2234
2235
2236
2237static int preferred_group_nid(struct task_struct *p, int nid)
2238{
2239 nodemask_t nodes;
2240 int dist;
2241
2242
2243 if (sched_numa_topology_type == NUMA_DIRECT)
2244 return nid;
2245
2246
2247
2248
2249
2250
2251 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
2252 unsigned long score, max_score = 0;
2253 int node, max_node = nid;
2254
2255 dist = sched_max_numa_distance;
2256
2257 for_each_online_node(node) {
2258 score = group_weight(p, node, dist);
2259 if (score > max_score) {
2260 max_score = score;
2261 max_node = node;
2262 }
2263 }
2264 return max_node;
2265 }
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276 nodes = node_online_map;
2277 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
2278 unsigned long max_faults = 0;
2279 nodemask_t max_group = NODE_MASK_NONE;
2280 int a, b;
2281
2282
2283 if (!find_numa_distance(dist))
2284 continue;
2285
2286 for_each_node_mask(a, nodes) {
2287 unsigned long faults = 0;
2288 nodemask_t this_group;
2289 nodes_clear(this_group);
2290
2291
2292 for_each_node_mask(b, nodes) {
2293 if (node_distance(a, b) < dist) {
2294 faults += group_faults(p, b);
2295 node_set(b, this_group);
2296 node_clear(b, nodes);
2297 }
2298 }
2299
2300
2301 if (faults > max_faults) {
2302 max_faults = faults;
2303 max_group = this_group;
2304
2305
2306
2307
2308
2309 nid = a;
2310 }
2311 }
2312
2313 if (!max_faults)
2314 break;
2315 nodes = max_group;
2316 }
2317 return nid;
2318}
2319
2320static void task_numa_placement(struct task_struct *p)
2321{
2322 int seq, nid, max_nid = NUMA_NO_NODE;
2323 unsigned long max_faults = 0;
2324 unsigned long fault_types[2] = { 0, 0 };
2325 unsigned long total_faults;
2326 u64 runtime, period;
2327 spinlock_t *group_lock = NULL;
2328 struct numa_group *ng;
2329
2330
2331
2332
2333
2334
2335 seq = READ_ONCE(p->mm->numa_scan_seq);
2336 if (p->numa_scan_seq == seq)
2337 return;
2338 p->numa_scan_seq = seq;
2339 p->numa_scan_period_max = task_scan_max(p);
2340
2341 total_faults = p->numa_faults_locality[0] +
2342 p->numa_faults_locality[1];
2343 runtime = numa_get_avg_runtime(p, &period);
2344
2345
2346 ng = deref_curr_numa_group(p);
2347 if (ng) {
2348 group_lock = &ng->lock;
2349 spin_lock_irq(group_lock);
2350 }
2351
2352
2353 for_each_online_node(nid) {
2354
2355 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
2356 unsigned long faults = 0, group_faults = 0;
2357 int priv;
2358
2359 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
2360 long diff, f_diff, f_weight;
2361
2362 mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
2363 membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
2364 cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
2365 cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
2366
2367
2368 diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
2369 fault_types[priv] += p->numa_faults[membuf_idx];
2370 p->numa_faults[membuf_idx] = 0;
2371
2372
2373
2374
2375
2376
2377
2378
2379 f_weight = div64_u64(runtime << 16, period + 1);
2380 f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
2381 (total_faults + 1);
2382 f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2383 p->numa_faults[cpubuf_idx] = 0;
2384
2385 p->numa_faults[mem_idx] += diff;
2386 p->numa_faults[cpu_idx] += f_diff;
2387 faults += p->numa_faults[mem_idx];
2388 p->total_numa_faults += diff;
2389 if (ng) {
2390
2391
2392
2393
2394
2395
2396
2397 ng->faults[mem_idx] += diff;
2398 ng->faults_cpu[mem_idx] += f_diff;
2399 ng->total_faults += diff;
2400 group_faults += ng->faults[mem_idx];
2401 }
2402 }
2403
2404 if (!ng) {
2405 if (faults > max_faults) {
2406 max_faults = faults;
2407 max_nid = nid;
2408 }
2409 } else if (group_faults > max_faults) {
2410 max_faults = group_faults;
2411 max_nid = nid;
2412 }
2413 }
2414
2415 if (ng) {
2416 numa_group_count_active_nodes(ng);
2417 spin_unlock_irq(group_lock);
2418 max_nid = preferred_group_nid(p, max_nid);
2419 }
2420
2421 if (max_faults) {
2422
2423 if (max_nid != p->numa_preferred_nid)
2424 sched_setnuma(p, max_nid);
2425 }
2426
2427 update_task_scan_period(p, fault_types[0], fault_types[1]);
2428}
2429
2430static inline int get_numa_group(struct numa_group *grp)
2431{
2432 return refcount_inc_not_zero(&grp->refcount);
2433}
2434
2435static inline void put_numa_group(struct numa_group *grp)
2436{
2437 if (refcount_dec_and_test(&grp->refcount))
2438 kfree_rcu(grp, rcu);
2439}
2440
2441static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2442 int *priv)
2443{
2444 struct numa_group *grp, *my_grp;
2445 struct task_struct *tsk;
2446 bool join = false;
2447 int cpu = cpupid_to_cpu(cpupid);
2448 int i;
2449
2450 if (unlikely(!deref_curr_numa_group(p))) {
2451 unsigned int size = sizeof(struct numa_group) +
2452 4*nr_node_ids*sizeof(unsigned long);
2453
2454 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2455 if (!grp)
2456 return;
2457
2458 refcount_set(&grp->refcount, 1);
2459 grp->active_nodes = 1;
2460 grp->max_faults_cpu = 0;
2461 spin_lock_init(&grp->lock);
2462 grp->gid = p->pid;
2463
2464 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
2465 nr_node_ids;
2466
2467 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2468 grp->faults[i] = p->numa_faults[i];
2469
2470 grp->total_faults = p->total_numa_faults;
2471
2472 grp->nr_tasks++;
2473 rcu_assign_pointer(p->numa_group, grp);
2474 }
2475
2476 rcu_read_lock();
2477 tsk = READ_ONCE(cpu_rq(cpu)->curr);
2478
2479 if (!cpupid_match_pid(tsk, cpupid))
2480 goto no_join;
2481
2482 grp = rcu_dereference(tsk->numa_group);
2483 if (!grp)
2484 goto no_join;
2485
2486 my_grp = deref_curr_numa_group(p);
2487 if (grp == my_grp)
2488 goto no_join;
2489
2490
2491
2492
2493
2494 if (my_grp->nr_tasks > grp->nr_tasks)
2495 goto no_join;
2496
2497
2498
2499
2500 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
2501 goto no_join;
2502
2503
2504 if (tsk->mm == current->mm)
2505 join = true;
2506
2507
2508 if (flags & TNF_SHARED)
2509 join = true;
2510
2511
2512 *priv = !join;
2513
2514 if (join && !get_numa_group(grp))
2515 goto no_join;
2516
2517 rcu_read_unlock();
2518
2519 if (!join)
2520 return;
2521
2522 BUG_ON(irqs_disabled());
2523 double_lock_irq(&my_grp->lock, &grp->lock);
2524
2525 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
2526 my_grp->faults[i] -= p->numa_faults[i];
2527 grp->faults[i] += p->numa_faults[i];
2528 }
2529 my_grp->total_faults -= p->total_numa_faults;
2530 grp->total_faults += p->total_numa_faults;
2531
2532 my_grp->nr_tasks--;
2533 grp->nr_tasks++;
2534
2535 spin_unlock(&my_grp->lock);
2536 spin_unlock_irq(&grp->lock);
2537
2538 rcu_assign_pointer(p->numa_group, grp);
2539
2540 put_numa_group(my_grp);
2541 return;
2542
2543no_join:
2544 rcu_read_unlock();
2545 return;
2546}
2547
2548
2549
2550
2551
2552
2553
2554
2555void task_numa_free(struct task_struct *p, bool final)
2556{
2557
2558 struct numa_group *grp = rcu_dereference_raw(p->numa_group);
2559 unsigned long *numa_faults = p->numa_faults;
2560 unsigned long flags;
2561 int i;
2562
2563 if (!numa_faults)
2564 return;
2565
2566 if (grp) {
2567 spin_lock_irqsave(&grp->lock, flags);
2568 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2569 grp->faults[i] -= p->numa_faults[i];
2570 grp->total_faults -= p->total_numa_faults;
2571
2572 grp->nr_tasks--;
2573 spin_unlock_irqrestore(&grp->lock, flags);
2574 RCU_INIT_POINTER(p->numa_group, NULL);
2575 put_numa_group(grp);
2576 }
2577
2578 if (final) {
2579 p->numa_faults = NULL;
2580 kfree(numa_faults);
2581 } else {
2582 p->total_numa_faults = 0;
2583 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2584 numa_faults[i] = 0;
2585 }
2586}
2587
2588
2589
2590
2591void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2592{
2593 struct task_struct *p = current;
2594 bool migrated = flags & TNF_MIGRATED;
2595 int cpu_node = task_node(current);
2596 int local = !!(flags & TNF_FAULT_LOCAL);
2597 struct numa_group *ng;
2598 int priv;
2599
2600 if (!static_branch_likely(&sched_numa_balancing))
2601 return;
2602
2603
2604 if (!p->mm)
2605 return;
2606
2607
2608 if (unlikely(!p->numa_faults)) {
2609 int size = sizeof(*p->numa_faults) *
2610 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
2611
2612 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2613 if (!p->numa_faults)
2614 return;
2615
2616 p->total_numa_faults = 0;
2617 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2618 }
2619
2620
2621
2622
2623
2624 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2625 priv = 1;
2626 } else {
2627 priv = cpupid_match_pid(p, last_cpupid);
2628 if (!priv && !(flags & TNF_NO_GROUP))
2629 task_numa_group(p, last_cpupid, flags, &priv);
2630 }
2631
2632
2633
2634
2635
2636
2637
2638 ng = deref_curr_numa_group(p);
2639 if (!priv && !local && ng && ng->active_nodes > 1 &&
2640 numa_is_active_node(cpu_node, ng) &&
2641 numa_is_active_node(mem_node, ng))
2642 local = 1;
2643
2644
2645
2646
2647
2648 if (time_after(jiffies, p->numa_migrate_retry)) {
2649 task_numa_placement(p);
2650 numa_migrate_preferred(p);
2651 }
2652
2653 if (migrated)
2654 p->numa_pages_migrated += pages;
2655 if (flags & TNF_MIGRATE_FAIL)
2656 p->numa_faults_locality[2] += pages;
2657
2658 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2659 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
2660 p->numa_faults_locality[local] += pages;
2661}
2662
2663static void reset_ptenuma_scan(struct task_struct *p)
2664{
2665
2666
2667
2668
2669
2670
2671
2672
2673 WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2674 p->mm->numa_scan_offset = 0;
2675}
2676
2677
2678
2679
2680
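/*
 * The expensive part of numa migration is done from task_work context.
 * Triggered from task_tick_numa().
 */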
2681static void task_numa_work(struct callback_head *work)
2682{
2683 unsigned long migrate, next_scan, now = jiffies;
2684 struct task_struct *p = current;
2685 struct mm_struct *mm = p->mm;
2686 u64 runtime = p->se.sum_exec_runtime;
2687 struct vm_area_struct *vma;
2688 unsigned long start, end;
2689 unsigned long nr_pte_updates = 0;
2690 long pages, virtpages;
2691
2692 SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
2693
2694 work->next = work;
2695
2696
2697
2698
2699
2700
2701
2702
2703 if (p->flags & PF_EXITING)
2704 return;
2705
2706 if (!mm->numa_next_scan) {
2707 mm->numa_next_scan = now +
2708 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2709 }
2710
2711
2712
2713
2714 migrate = mm->numa_next_scan;
2715 if (time_before(now, migrate))
2716 return;
2717
2718 if (p->numa_scan_period == 0) {
2719 p->numa_scan_period_max = task_scan_max(p);
2720 p->numa_scan_period = task_scan_start(p);
2721 }
2722
2723 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
2724 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2725 return;
2726
2727
2728
2729
2730
2731 p->node_stamp += 2 * TICK_NSEC;
2732
2733 start = mm->numa_scan_offset;
2734 pages = sysctl_numa_balancing_scan_size;
2735 pages <<= 20 - PAGE_SHIFT;
2736 virtpages = pages * 8;
2737 if (!pages)
2738 return;
2739
2740
2741 if (!mmap_read_trylock(mm))
2742 return;
2743 vma = find_vma(mm, start);
2744 if (!vma) {
2745 reset_ptenuma_scan(p);
2746 start = 0;
2747 vma = mm->mmap;
2748 }
2749 for (; vma; vma = vma->vm_next) {
2750 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
2751 is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
2752 continue;
2753 }
2754
2755
2756
2757
2758
2759
2760
2761 if (!vma->vm_mm ||
2762 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2763 continue;
2764
2765
2766
2767
2768
2769 if (!vma_is_accessible(vma))
2770 continue;
2771
2772 do {
2773 start = max(start, vma->vm_start);
2774 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2775 end = min(end, vma->vm_end);
2776 nr_pte_updates = change_prot_numa(vma, start, end);
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786 if (nr_pte_updates)
2787 pages -= (end - start) >> PAGE_SHIFT;
2788 virtpages -= (end - start) >> PAGE_SHIFT;
2789
2790 start = end;
2791 if (pages <= 0 || virtpages <= 0)
2792 goto out;
2793
2794 cond_resched();
2795 } while (end != vma->vm_end);
2796 }
2797
2798out:
2799
2800
2801
2802
2803
2804
2805 if (vma)
2806 mm->numa_scan_offset = start;
2807 else
2808 reset_ptenuma_scan(p);
2809 mmap_read_unlock(mm);
2810
2811
2812
2813
2814
2815
2816
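	/*
	 * Make sure tasks use at least 32x as much time to run other code
	 * than they used here, to limit NUMA PTE scanning overhead to ~3%
	 * max on an overloaded system.
	 */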
2817 if (unlikely(p->se.sum_exec_runtime != runtime)) {
2818 u64 diff = p->se.sum_exec_runtime - runtime;
2819 p->node_stamp += 32 * diff;
2820 }
2821}
2822
2823void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
2824{
2825 int mm_users = 0;
2826 struct mm_struct *mm = p->mm;
2827
2828 if (mm) {
2829 mm_users = atomic_read(&mm->mm_users);
2830 if (mm_users == 1) {
2831 mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2832 mm->numa_scan_seq = 0;
2833 }
2834 }
2835 p->node_stamp = 0;
2836 p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
2837 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
2838
2839 p->numa_work.next = &p->numa_work;
2840 p->numa_faults = NULL;
2841 RCU_INIT_POINTER(p->numa_group, NULL);
2842 p->last_task_numa_placement = 0;
2843 p->last_sum_exec_runtime = 0;
2844
2845 init_task_work(&p->numa_work, task_numa_work);
2846
2847
2848 if (!(clone_flags & CLONE_VM)) {
2849 p->numa_preferred_nid = NUMA_NO_NODE;
2850 return;
2851 }
2852
2853
2854
2855
2856
2857 if (mm) {
2858 unsigned int delay;
2859
2860 delay = min_t(unsigned int, task_scan_max(current),
2861 current->numa_scan_period * mm_users * NSEC_PER_MSEC);
2862 delay += 2 * TICK_NSEC;
2863 p->node_stamp = delay;
2864 }
2865}
2866
2867
2868
2869
2870static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2871{
2872 struct callback_head *work = &curr->numa_work;
2873 u64 period, now;
2874
2875
2876
2877
2878 if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
2879 return;
2880
2881
2882
2883
2884
2885
2886
2887 now = curr->se.sum_exec_runtime;
2888 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2889
2890 if (now > curr->node_stamp + period) {
2891 if (!curr->node_stamp)
2892 curr->numa_scan_period = task_scan_start(curr);
2893 curr->node_stamp += period;
2894
2895 if (!time_before(jiffies, curr->mm->numa_next_scan))
2896 task_work_add(curr, work, TWA_RESUME);
2897 }
2898}
2899
2900static void update_scan_period(struct task_struct *p, int new_cpu)
2901{
2902 int src_nid = cpu_to_node(task_cpu(p));
2903 int dst_nid = cpu_to_node(new_cpu);
2904
2905 if (!static_branch_likely(&sched_numa_balancing))
2906 return;
2907
2908 if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
2909 return;
2910
2911 if (src_nid == dst_nid)
2912 return;
2913
2914
2915
2916
2917
2918
2919 if (p->numa_scan_seq) {
2920
2921
2922
2923
2924
2925 if (dst_nid == p->numa_preferred_nid ||
2926 (p->numa_preferred_nid != NUMA_NO_NODE &&
2927 src_nid != p->numa_preferred_nid))
2928 return;
2929 }
2930
2931 p->numa_scan_period = task_scan_start(p);
2932}
2933
2934#else
2935static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2936{
2937}
2938
2939static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2940{
2941}
2942
2943static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2944{
2945}
2946
2947static inline void update_scan_period(struct task_struct *p, int new_cpu)
2948{
2949}
2950
2951#endif
2952
2953static void
2954account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2955{
2956 update_load_add(&cfs_rq->load, se->load.weight);
2957#ifdef CONFIG_SMP
2958 if (entity_is_task(se)) {
2959 struct rq *rq = rq_of(cfs_rq);
2960
2961 account_numa_enqueue(rq, task_of(se));
2962 list_add(&se->group_node, &rq->cfs_tasks);
2963 }
2964#endif
2965 cfs_rq->nr_running++;
2966}
2967
2968static void
2969account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2970{
2971 update_load_sub(&cfs_rq->load, se->load.weight);
2972#ifdef CONFIG_SMP
2973 if (entity_is_task(se)) {
2974 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
2975 list_del_init(&se->group_node);
2976 }
2977#endif
2978 cfs_rq->nr_running--;
2979}
2980
2981
2982
2983
2984
2985
2986
2987
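/*
 * Signed add and clamp on underflow.
 *
 * Explicitly do a load-store to ensure the intermediate value never hits
 * memory. This allows lockless observations without ever seeing the
 * negative values. The sub_positive() and lsub_positive() variants below
 * do the same for unsigned subtraction and local variables respectively.
 */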
2988#define add_positive(_ptr, _val) do { \
2989 typeof(_ptr) ptr = (_ptr); \
2990 typeof(_val) val = (_val); \
2991 typeof(*ptr) res, var = READ_ONCE(*ptr); \
2992 \
2993 res = var + val; \
2994 \
2995 if (val < 0 && res > var) \
2996 res = 0; \
2997 \
2998 WRITE_ONCE(*ptr, res); \
2999} while (0)
3000
3001
3002
3003
3004
3005
3006
3007
3008#define sub_positive(_ptr, _val) do { \
3009 typeof(_ptr) ptr = (_ptr); \
3010 typeof(*ptr) val = (_val); \
3011 typeof(*ptr) res, var = READ_ONCE(*ptr); \
3012 res = var - val; \
3013 if (res > var) \
3014 res = 0; \
3015 WRITE_ONCE(*ptr, res); \
3016} while (0)
3017
3018
3019
3020
3021
3022
3023
3024#define lsub_positive(_ptr, _val) do { \
3025 typeof(_ptr) ptr = (_ptr); \
3026 *ptr -= min_t(typeof(*ptr), *ptr, _val); \
3027} while (0)
3028
3029#ifdef CONFIG_SMP
3030static inline void
3031enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3032{
3033 cfs_rq->avg.load_avg += se->avg.load_avg;
3034 cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
3035}
3036
3037static inline void
3038dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3039{
3040 u32 divider = get_pelt_divider(&se->avg);
3041 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
3042 cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
3043}
3044#else
3045static inline void
3046enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
3047static inline void
3048dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
3049#endif
3050
3051static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
3052 unsigned long weight)
3053{
3054 if (se->on_rq) {
3055
3056 if (cfs_rq->curr == se)
3057 update_curr(cfs_rq);
3058 update_load_sub(&cfs_rq->load, se->load.weight);
3059 }
3060 dequeue_load_avg(cfs_rq, se);
3061
3062 update_load_set(&se->load, weight);
3063
3064#ifdef CONFIG_SMP
3065 do {
3066 u32 divider = get_pelt_divider(&se->avg);
3067
3068 se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
3069 } while (0);
3070#endif
3071
3072 enqueue_load_avg(cfs_rq, se);
3073 if (se->on_rq)
3074 update_load_add(&cfs_rq->load, se->load.weight);
3075
3076}
3077
3078void reweight_task(struct task_struct *p, int prio)
3079{
3080 struct sched_entity *se = &p->se;
3081 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3082 struct load_weight *load = &se->load;
3083 unsigned long weight = scale_load(sched_prio_to_weight[prio]);
3084
3085 reweight_entity(cfs_rq, se, weight);
3086 load->inv_weight = sched_prio_to_wmult[prio];
3087}
3088
3089#ifdef CONFIG_FAIR_GROUP_SCHED
3090#ifdef CONFIG_SMP
3091
3163
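/*
 * calc_group_shares() approximates the hierarchical share of CPU weight a
 * group entity should receive on this CPU:
 *
 *			     tg->shares * grq->load.weight
 *   ge->load.weight = ------------------------------------
 *			      \Sum grq->load.weight
 *
 * Because the global sum is expensive to compute exactly, it is approximated
 * with the task group's load_avg (tg->load_avg), substituting this CPU's own
 * contribution with its current load so the result reacts quickly to local
 * changes. The result is clamped to [MIN_SHARES, tg->shares].
 */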
3164static long calc_group_shares(struct cfs_rq *cfs_rq)
3165{
3166 long tg_weight, tg_shares, load, shares;
3167 struct task_group *tg = cfs_rq->tg;
3168
3169 tg_shares = READ_ONCE(tg->shares);
3170
3171 load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
3172
3173 tg_weight = atomic_long_read(&tg->load_avg);
3174
3175
3176 tg_weight -= cfs_rq->tg_load_avg_contrib;
3177 tg_weight += load;
3178
3179 shares = (tg_shares * load);
3180 if (tg_weight)
3181 shares /= tg_weight;
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195 return clamp_t(long, shares, MIN_SHARES, tg_shares);
3196}
3197#endif
3198
3199static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
3200
3201
3202
3203
3204
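/*
 * Recomputes the group entity based on the current state of its group
 * runqueue.
 */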
3205static void update_cfs_group(struct sched_entity *se)
3206{
3207 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3208 long shares;
3209
3210 if (!gcfs_rq)
3211 return;
3212
3213 if (throttled_hierarchy(gcfs_rq))
3214 return;
3215
3216#ifndef CONFIG_SMP
3217 shares = READ_ONCE(gcfs_rq->tg->shares);
3218
3219 if (likely(se->load.weight == shares))
3220 return;
3221#else
3222 shares = calc_group_shares(gcfs_rq);
3223#endif
3224
3225 reweight_entity(cfs_rq_of(se), se, shares);
3226}
3227
3228#else
3229static inline void update_cfs_group(struct sched_entity *se)
3230{
3231}
3232#endif
3233
3234static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
3235{
3236 struct rq *rq = rq_of(cfs_rq);
3237
3238 if (&rq->cfs == cfs_rq) {
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
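		/*
		 * Only the root cfs_rq drives cpufreq here: its utilization
		 * aggregates the whole CFS hierarchy for this CPU, so the
		 * governor is notified once per root update rather than for
		 * every child cfs_rq change.
		 */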
3253 cpufreq_update_util(rq, flags);
3254 }
3255}
3256
3257#ifdef CONFIG_SMP
3258#ifdef CONFIG_FAIR_GROUP_SCHED
3259
3260
3261
3262
3263
3264
3265
3266
3267static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq)
3268{
3269 struct cfs_rq *prev_cfs_rq;
3270 struct list_head *prev;
3271
3272 if (cfs_rq->on_list) {
3273 prev = cfs_rq->leaf_cfs_rq_list.prev;
3274 } else {
3275 struct rq *rq = rq_of(cfs_rq);
3276
3277 prev = rq->tmp_alone_branch;
3278 }
3279
3280 prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list);
3281
3282 return (prev_cfs_rq->tg->parent == cfs_rq->tg);
3283}
3284
3285static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
3286{
3287 if (cfs_rq->load.weight)
3288 return false;
3289
3290 if (cfs_rq->avg.load_sum)
3291 return false;
3292
3293 if (cfs_rq->avg.util_sum)
3294 return false;
3295
3296 if (cfs_rq->avg.runnable_sum)
3297 return false;
3298
3299 if (child_cfs_rq_on_list(cfs_rq))
3300 return false;
3301
3302
3303
3304
3305
3306
3307 SCHED_WARN_ON(cfs_rq->avg.load_avg ||
3308 cfs_rq->avg.util_avg ||
3309 cfs_rq->avg.runnable_avg);
3310
3311 return true;
3312}
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
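/*
 * update_tg_load_avg - update the tg's load avg
 * @cfs_rq: the cfs_rq whose avg changed
 *
 * This keeps tg->load_avg ~= \Sum tg->cfs_rq[]->avg.load. Because
 * tg->load_avg is a global value, a differential update is used: the last
 * propagated value is remembered in cfs_rq->tg_load_avg_contrib, and the
 * global sum is only touched when the delta exceeds 1/64 of that
 * contribution.
 */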
3328static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
3329{
3330 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
3331
3332
3333
3334
3335 if (cfs_rq->tg == &root_task_group)
3336 return;
3337
3338 if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
3339 atomic_long_add(delta, &cfs_rq->tg->load_avg);
3340 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
3341 }
3342}
3343
3344
3345
3346
3347
3348
3349void set_task_rq_fair(struct sched_entity *se,
3350 struct cfs_rq *prev, struct cfs_rq *next)
3351{
3352 u64 p_last_update_time;
3353 u64 n_last_update_time;
3354
3355 if (!sched_feat(ATTACH_AGE_LOAD))
3356 return;
3357
3358
3359
3360
3361
3362
3363
3364
3365 if (!(se->avg.last_update_time && prev))
3366 return;
3367
3368#ifndef CONFIG_64BIT
3369 {
3370 u64 p_last_update_time_copy;
3371 u64 n_last_update_time_copy;
3372
3373 do {
3374 p_last_update_time_copy = prev->load_last_update_time_copy;
3375 n_last_update_time_copy = next->load_last_update_time_copy;
3376
3377 smp_rmb();
3378
3379 p_last_update_time = prev->avg.last_update_time;
3380 n_last_update_time = next->avg.last_update_time;
3381
3382 } while (p_last_update_time != p_last_update_time_copy ||
3383 n_last_update_time != n_last_update_time_copy);
3384 }
3385#else
3386 p_last_update_time = prev->avg.last_update_time;
3387 n_last_update_time = next->avg.last_update_time;
3388#endif
3389 __update_load_avg_blocked_se(p_last_update_time, se);
3390 se->avg.last_update_time = n_last_update_time;
3391}
3392
3393
3461
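/*
 * With fair group scheduling, changes in a child (group) cfs_rq's PELT
 * signals must be propagated up the hierarchy. The helpers below copy the
 * group cfs_rq's util/runnable averages into its owning sched_entity and
 * apply the resulting delta to the parent cfs_rq; the load variant derives
 * the entity's load from the accumulated prop_runnable_sum.
 */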
3462static inline void
3463update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3464{
3465 long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
3466 u32 divider;
3467
3468
3469 if (!delta)
3470 return;
3471
3472
3473
3474
3475
3476 divider = get_pelt_divider(&cfs_rq->avg);
3477
3478
3479 se->avg.util_avg = gcfs_rq->avg.util_avg;
3480 se->avg.util_sum = se->avg.util_avg * divider;
3481
3482
3483 add_positive(&cfs_rq->avg.util_avg, delta);
3484 cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
3485}
3486
3487static inline void
3488update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3489{
3490 long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
3491 u32 divider;
3492
3493
3494 if (!delta)
3495 return;
3496
3497
3498
3499
3500
3501 divider = get_pelt_divider(&cfs_rq->avg);
3502
3503
3504 se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
3505 se->avg.runnable_sum = se->avg.runnable_avg * divider;
3506
3507
3508 add_positive(&cfs_rq->avg.runnable_avg, delta);
3509 cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
3510}
3511
3512static inline void
3513update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3514{
3515 long delta, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
3516 unsigned long load_avg;
3517 u64 load_sum = 0;
3518 u32 divider;
3519
3520 if (!runnable_sum)
3521 return;
3522
3523 gcfs_rq->prop_runnable_sum = 0;
3524
3525
3526
3527
3528
3529 divider = get_pelt_divider(&cfs_rq->avg);
3530
3531 if (runnable_sum >= 0) {
3532
3533
3534
3535
3536 runnable_sum += se->avg.load_sum;
3537 runnable_sum = min_t(long, runnable_sum, divider);
3538 } else {
3539
3540
3541
3542
3543 if (scale_load_down(gcfs_rq->load.weight)) {
3544 load_sum = div_s64(gcfs_rq->avg.load_sum,
3545 scale_load_down(gcfs_rq->load.weight));
3546 }
3547
3548
3549 runnable_sum = min(se->avg.load_sum, load_sum);
3550 }
3551
3552
3553
3554
3555
3556
3557
3558 running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
3559 runnable_sum = max(runnable_sum, running_sum);
3560
3561 load_sum = (s64)se_weight(se) * runnable_sum;
3562 load_avg = div_s64(load_sum, divider);
3563
3564 se->avg.load_sum = runnable_sum;
3565
3566 delta = load_avg - se->avg.load_avg;
3567 if (!delta)
3568 return;
3569
3570 se->avg.load_avg = load_avg;
3571
3572 add_positive(&cfs_rq->avg.load_avg, delta);
3573 cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
3574}
3575
3576static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
3577{
3578 cfs_rq->propagate = 1;
3579 cfs_rq->prop_runnable_sum += runnable_sum;
3580}
3581
3582
3583static inline int propagate_entity_load_avg(struct sched_entity *se)
3584{
3585 struct cfs_rq *cfs_rq, *gcfs_rq;
3586
3587 if (entity_is_task(se))
3588 return 0;
3589
3590 gcfs_rq = group_cfs_rq(se);
3591 if (!gcfs_rq->propagate)
3592 return 0;
3593
3594 gcfs_rq->propagate = 0;
3595
3596 cfs_rq = cfs_rq_of(se);
3597
3598 add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
3599
3600 update_tg_cfs_util(cfs_rq, se, gcfs_rq);
3601 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
3602 update_tg_cfs_load(cfs_rq, se, gcfs_rq);
3603
3604 trace_pelt_cfs_tp(cfs_rq);
3605 trace_pelt_se_tp(se);
3606
3607 return 1;
3608}
3609
3610
3611
3612
3613
3614static inline bool skip_blocked_update(struct sched_entity *se)
3615{
3616 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3617
3618
3619
3620
3621
3622 if (se->avg.load_avg || se->avg.util_avg)
3623 return false;
3624
3625
3626
3627
3628
3629 if (gcfs_rq->propagate)
3630 return false;
3631
3632
3633
3634
3635
3636
3637 return true;
3638}
3639
3640#else
3641
3642static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
3643
3644static inline int propagate_entity_load_avg(struct sched_entity *se)
3645{
3646 return 0;
3647}
3648
3649static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
3650
3651#endif
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
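/*
 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
 * @now: current time, as per cfs_rq_clock_pelt()
 * @cfs_rq: cfs_rq to update
 *
 * The cfs_rq avg is the direct sum of all its entities' (blocked and
 * runnable) averages, so removed entities are folded in here before the
 * periodic decay. Returns non-zero if the load decayed or removed load was
 * applied.
 */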
3669static inline int
3670update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
3671{
3672 unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
3673 struct sched_avg *sa = &cfs_rq->avg;
3674 int decayed = 0;
3675
3676 if (cfs_rq->removed.nr) {
3677 unsigned long r;
3678 u32 divider = get_pelt_divider(&cfs_rq->avg);
3679
3680 raw_spin_lock(&cfs_rq->removed.lock);
3681 swap(cfs_rq->removed.util_avg, removed_util);
3682 swap(cfs_rq->removed.load_avg, removed_load);
3683 swap(cfs_rq->removed.runnable_avg, removed_runnable);
3684 cfs_rq->removed.nr = 0;
3685 raw_spin_unlock(&cfs_rq->removed.lock);
3686
3687 r = removed_load;
3688 sub_positive(&sa->load_avg, r);
3689 sa->load_sum = sa->load_avg * divider;
3690
3691 r = removed_util;
3692 sub_positive(&sa->util_avg, r);
3693 sa->util_sum = sa->util_avg * divider;
3694
3695 r = removed_runnable;
3696 sub_positive(&sa->runnable_avg, r);
3697 sa->runnable_sum = sa->runnable_avg * divider;
3698
3699
3700
3701
3702
3703 add_tg_cfs_propagate(cfs_rq,
3704 -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT);
3705
3706 decayed = 1;
3707 }
3708
3709 decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
3710
3711#ifndef CONFIG_64BIT
3712 smp_wmb();
3713 cfs_rq->load_last_update_time_copy = sa->last_update_time;
3714#endif
3715
3716 return decayed;
3717}
3718
3719
3720
3721
3722
3723
3724
3725
3726
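/*
 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
 * @cfs_rq: cfs_rq to attach to
 * @se: sched_entity to attach
 *
 * Must call update_cfs_rq_load_avg() before this, since we rely on
 * cfs_rq->avg.last_update_time being current.
 */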
3727static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3728{
3729
3730
3731
3732
3733 u32 divider = get_pelt_divider(&cfs_rq->avg);
3734
3735
3736
3737
3738
3739
3740
3741
3742 se->avg.last_update_time = cfs_rq->avg.last_update_time;
3743 se->avg.period_contrib = cfs_rq->avg.period_contrib;
3744
3745
3746
3747
3748
3749
3750
3751 se->avg.util_sum = se->avg.util_avg * divider;
3752
3753 se->avg.runnable_sum = se->avg.runnable_avg * divider;
3754
3755 se->avg.load_sum = divider;
3756 if (se_weight(se)) {
3757 se->avg.load_sum =
3758 div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
3759 }
3760
3761 enqueue_load_avg(cfs_rq, se);
3762 cfs_rq->avg.util_avg += se->avg.util_avg;
3763 cfs_rq->avg.util_sum += se->avg.util_sum;
3764 cfs_rq->avg.runnable_avg += se->avg.runnable_avg;
3765 cfs_rq->avg.runnable_sum += se->avg.runnable_sum;
3766
3767 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
3768
3769 cfs_rq_util_change(cfs_rq, 0);
3770
3771 trace_pelt_cfs_tp(cfs_rq);
3772}
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3783{
3784
3785
3786
3787
3788 u32 divider = get_pelt_divider(&cfs_rq->avg);
3789
3790 dequeue_load_avg(cfs_rq, se);
3791 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
3792 cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
3793 sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
3794 cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
3795
3796 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
3797
3798 cfs_rq_util_change(cfs_rq, 0);
3799
3800 trace_pelt_cfs_tp(cfs_rq);
3801}
3802
3803
3804
3805
3806#define UPDATE_TG 0x1
3807#define SKIP_AGE_LOAD 0x2
3808#define DO_ATTACH 0x4
3809
3810
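/* Update task and its cfs_rq load average */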
3811static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3812{
3813 u64 now = cfs_rq_clock_pelt(cfs_rq);
3814 int decayed;
3815
3816
3817
3818
3819
3820 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
3821 __update_load_avg_se(now, cfs_rq, se);
3822
3823 decayed = update_cfs_rq_load_avg(now, cfs_rq);
3824 decayed |= propagate_entity_load_avg(se);
3825
3826 if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
3827
3828
3829
3830
3831
3832
3833
3834
3835 attach_entity_load_avg(cfs_rq, se);
3836 update_tg_load_avg(cfs_rq);
3837
3838 } else if (decayed) {
3839 cfs_rq_util_change(cfs_rq, 0);
3840
3841 if (flags & UPDATE_TG)
3842 update_tg_load_avg(cfs_rq);
3843 }
3844}
3845
3846#ifndef CONFIG_64BIT
3847static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3848{
3849 u64 last_update_time_copy;
3850 u64 last_update_time;
3851
3852 do {
3853 last_update_time_copy = cfs_rq->load_last_update_time_copy;
3854 smp_rmb();
3855 last_update_time = cfs_rq->avg.last_update_time;
3856 } while (last_update_time != last_update_time_copy);
3857
3858 return last_update_time;
3859}
3860#else
3861static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3862{
3863 return cfs_rq->avg.last_update_time;
3864}
3865#endif
3866
3867
3868
3869
3870
3871static void sync_entity_load_avg(struct sched_entity *se)
3872{
3873 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3874 u64 last_update_time;
3875
3876 last_update_time = cfs_rq_last_update_time(cfs_rq);
3877 __update_load_avg_blocked_se(last_update_time, se);
3878}
3879
3880
3881
3882
3883
3884static void remove_entity_load_avg(struct sched_entity *se)
3885{
3886 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3887 unsigned long flags;
3888
3889
3890
3891
3892
3893
3894
3895 sync_entity_load_avg(se);
3896
3897 raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
3898 ++cfs_rq->removed.nr;
3899 cfs_rq->removed.util_avg += se->avg.util_avg;
3900 cfs_rq->removed.load_avg += se->avg.load_avg;
3901 cfs_rq->removed.runnable_avg += se->avg.runnable_avg;
3902 raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
3903}
3904
3905static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq)
3906{
3907 return cfs_rq->avg.runnable_avg;
3908}
3909
3910static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
3911{
3912 return cfs_rq->avg.load_avg;
3913}
3914
3915static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
3916
3917static inline unsigned long task_util(struct task_struct *p)
3918{
3919 return READ_ONCE(p->se.avg.util_avg);
3920}
3921
3922static inline unsigned long _task_util_est(struct task_struct *p)
3923{
3924 struct util_est ue = READ_ONCE(p->se.avg.util_est);
3925
3926 return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
3927}
3928
3929static inline unsigned long task_util_est(struct task_struct *p)
3930{
3931 return max(task_util(p), _task_util_est(p));
3932}
3933
3934#ifdef CONFIG_UCLAMP_TASK
3935static inline unsigned long uclamp_task_util(struct task_struct *p)
3936{
3937 return clamp(task_util_est(p),
3938 uclamp_eff_value(p, UCLAMP_MIN),
3939 uclamp_eff_value(p, UCLAMP_MAX));
3940}
3941#else
3942static inline unsigned long uclamp_task_util(struct task_struct *p)
3943{
3944 return task_util_est(p);
3945}
3946#endif
3947
3948static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
3949 struct task_struct *p)
3950{
3951 unsigned int enqueued;
3952
3953 if (!sched_feat(UTIL_EST))
3954 return;
3955
3956
3957 enqueued = cfs_rq->avg.util_est.enqueued;
3958 enqueued += _task_util_est(p);
3959 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
3960
3961 trace_sched_util_est_cfs_tp(cfs_rq);
3962}
3963
3964static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
3965 struct task_struct *p)
3966{
3967 unsigned int enqueued;
3968
3969 if (!sched_feat(UTIL_EST))
3970 return;
3971
3972
3973 enqueued = cfs_rq->avg.util_est.enqueued;
3974 enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
3975 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
3976
3977 trace_sched_util_est_cfs_tp(cfs_rq);
3978}
3979
3980#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
3981
3982
3983
3984
3985
3986
3987
3988
3989
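/*
 * Check if a (signed) value is within a specified (unsigned) margin,
 * based on the observation that:
 *
 *	abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
 *
 * NOTE: this only works when value + margin < INT_MAX.
 */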
3990static inline bool within_margin(int value, int margin)
3991{
3992 return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
3993}
3994
3995static inline void util_est_update(struct cfs_rq *cfs_rq,
3996 struct task_struct *p,
3997 bool task_sleep)
3998{
3999 long last_ewma_diff, last_enqueued_diff;
4000 struct util_est ue;
4001
4002 if (!sched_feat(UTIL_EST))
4003 return;
4004
4005
4006
4007
4008
4009 if (!task_sleep)
4010 return;
4011
4012
4013
4014
4015
4016 ue = p->se.avg.util_est;
4017 if (ue.enqueued & UTIL_AVG_UNCHANGED)
4018 return;
4019
4020 last_enqueued_diff = ue.enqueued;
4021
4022
4023
4024
4025
4026 ue.enqueued = task_util(p);
4027 if (sched_feat(UTIL_EST_FASTUP)) {
4028 if (ue.ewma < ue.enqueued) {
4029 ue.ewma = ue.enqueued;
4030 goto done;
4031 }
4032 }
4033
4034
4035
4036
4037
4038 last_ewma_diff = ue.enqueued - ue.ewma;
4039 last_enqueued_diff -= ue.enqueued;
4040 if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) {
4041 if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN))
4042 goto done;
4043
4044 return;
4045 }
4046
4047
4048
4049
4050
4051 if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
4052 return;
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
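	/*
	 * Update the task's estimated utilization by folding the new sample
	 * into an Exponential Weighted Moving Average:
	 *
	 *  ewma(t) =  w *  task_util(p) + (1-w) * ewma(t-1)
	 *          =  w * (task_util(p) - ewma(t-1)) + ewma(t-1)
	 *          =  w * (last_ewma_diff + ewma(t-1) / w)
	 *
	 * where the weight of new samples is w = 1/4, implemented with the
	 * UTIL_EST_WEIGHT_SHIFT shifts below.
	 */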
4071 ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
4072 ue.ewma += last_ewma_diff;
4073 ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
4074done:
4075 ue.enqueued |= UTIL_AVG_UNCHANGED;
4076 WRITE_ONCE(p->se.avg.util_est, ue);
4077
4078 trace_sched_util_est_se_tp(&p->se);
4079}
4080
4081static inline int task_fits_capacity(struct task_struct *p, long capacity)
4082{
4083 return fits_capacity(uclamp_task_util(p), capacity);
4084}
4085
4086static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
4087{
4088 if (!static_branch_unlikely(&sched_asym_cpucapacity))
4089 return;
4090
4091 if (!p || p->nr_cpus_allowed == 1) {
4092 rq->misfit_task_load = 0;
4093 return;
4094 }
4095
4096 if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
4097 rq->misfit_task_load = 0;
4098 return;
4099 }
4100
4101
4102
4103
4104
4105 rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
4106}
4107
4108#else
4109
4110static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
4111{
4112 return true;
4113}
4114
4115#define UPDATE_TG 0x0
4116#define SKIP_AGE_LOAD 0x0
4117#define DO_ATTACH 0x0
4118
4119static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
4120{
4121 cfs_rq_util_change(cfs_rq, 0);
4122}
4123
4124static inline void remove_entity_load_avg(struct sched_entity *se) {}
4125
4126static inline void
4127attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
4128static inline void
4129detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
4130
4131static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
4132{
4133 return 0;
4134}
4135
4136static inline void
4137util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
4138
4139static inline void
4140util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
4141
4142static inline void
4143util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
4144 bool task_sleep) {}
4145static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
4146
4147#endif
4148
4149static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
4150{
4151#ifdef CONFIG_SCHED_DEBUG
4152 s64 d = se->vruntime - cfs_rq->min_vruntime;
4153
4154 if (d < 0)
4155 d = -d;
4156
4157 if (d > 3*sysctl_sched_latency)
4158 schedstat_inc(cfs_rq->nr_spread_over);
4159#endif
4160}
4161
4162static void
4163place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
4164{
4165 u64 vruntime = cfs_rq->min_vruntime;
4166
4167
4168
4169
4170
4171
4172
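	/*
	 * The 'current' period is already promised to the current tasks,
	 * however the extra weight of the new task will slow them down a
	 * little, place the new task so that it fits in the slot that
	 * stays open at the end.
	 */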
4173 if (initial && sched_feat(START_DEBIT))
4174 vruntime += sched_vslice(cfs_rq, se);
4175
4176
4177 if (!initial) {
4178 unsigned long thresh = sysctl_sched_latency;
4179
4180
4181
4182
4183
4184 if (sched_feat(GENTLE_FAIR_SLEEPERS))
4185 thresh >>= 1;
4186
4187 vruntime -= thresh;
4188 }
4189
4190
4191 se->vruntime = max_vruntime(se->vruntime, vruntime);
4192}
4193
4194static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
4195
4196static inline void check_schedstat_required(void)
4197{
4198#ifdef CONFIG_SCHEDSTATS
4199 if (schedstat_enabled())
4200 return;
4201
4202
4203 if (trace_sched_stat_wait_enabled() ||
4204 trace_sched_stat_sleep_enabled() ||
4205 trace_sched_stat_iowait_enabled() ||
4206 trace_sched_stat_blocked_enabled() ||
4207 trace_sched_stat_runtime_enabled()) {
4208 printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
4209 "stat_blocked and stat_runtime require the "
4210 "kernel parameter schedstats=enable or "
4211 "kernel.sched_schedstats=1\n");
4212 }
4213#endif
4214}
4215
4216static inline bool cfs_bandwidth_used(void);
4217
4218
4247
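/*
 * Enqueue @se onto @cfs_rq. The ENQUEUE_WAKEUP and ENQUEUE_MIGRATED flags
 * control vruntime renormalization: entities that are not being woken up,
 * or that have just migrated between CPUs, carry a vruntime relative to
 * their old queue and must have cfs_rq->min_vruntime added back before
 * being placed in the rbtree.
 */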
4248static void
4249enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4250{
4251 bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
4252 bool curr = cfs_rq->curr == se;
4253
4254
4255
4256
4257
4258 if (renorm && curr)
4259 se->vruntime += cfs_rq->min_vruntime;
4260
4261 update_curr(cfs_rq);
4262
4263
4264
4265
4266
4267
4268
4269 if (renorm && !curr)
4270 se->vruntime += cfs_rq->min_vruntime;
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280 update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
4281 se_update_runnable(se);
4282 update_cfs_group(se);
4283 account_entity_enqueue(cfs_rq, se);
4284
4285 if (flags & ENQUEUE_WAKEUP)
4286 place_entity(cfs_rq, se, 0);
4287
4288 check_schedstat_required();
4289 update_stats_enqueue(cfs_rq, se, flags);
4290 check_spread(cfs_rq, se);
4291 if (!curr)
4292 __enqueue_entity(cfs_rq, se);
4293 se->on_rq = 1;
4294
4295
4296
4297
4298
4299
4300 if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
4301 list_add_leaf_cfs_rq(cfs_rq);
4302
4303 if (cfs_rq->nr_running == 1)
4304 check_enqueue_throttle(cfs_rq);
4305}
4306
4307static void __clear_buddies_last(struct sched_entity *se)
4308{
4309 for_each_sched_entity(se) {
4310 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4311 if (cfs_rq->last != se)
4312 break;
4313
4314 cfs_rq->last = NULL;
4315 }
4316}
4317
4318static void __clear_buddies_next(struct sched_entity *se)
4319{
4320 for_each_sched_entity(se) {
4321 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4322 if (cfs_rq->next != se)
4323 break;
4324
4325 cfs_rq->next = NULL;
4326 }
4327}
4328
4329static void __clear_buddies_skip(struct sched_entity *se)
4330{
4331 for_each_sched_entity(se) {
4332 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4333 if (cfs_rq->skip != se)
4334 break;
4335
4336 cfs_rq->skip = NULL;
4337 }
4338}
4339
4340static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
4341{
4342 if (cfs_rq->last == se)
4343 __clear_buddies_last(se);
4344
4345 if (cfs_rq->next == se)
4346 __clear_buddies_next(se);
4347
4348 if (cfs_rq->skip == se)
4349 __clear_buddies_skip(se);
4350}
4351
4352static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4353
4354static void
4355dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4356{
4357
4358
4359
4360 update_curr(cfs_rq);
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370 update_load_avg(cfs_rq, se, UPDATE_TG);
4371 se_update_runnable(se);
4372
4373 update_stats_dequeue(cfs_rq, se, flags);
4374
4375 clear_buddies(cfs_rq, se);
4376
4377 if (se != cfs_rq->curr)
4378 __dequeue_entity(cfs_rq, se);
4379 se->on_rq = 0;
4380 account_entity_dequeue(cfs_rq, se);
4381
4382
4383
4384
4385
4386
4387
4388 if (!(flags & DEQUEUE_SLEEP))
4389 se->vruntime -= cfs_rq->min_vruntime;
4390
4391
4392 return_cfs_rq_runtime(cfs_rq);
4393
4394 update_cfs_group(se);
4395
4396
4397
4398
4399
4400
4401
4402 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
4403 update_min_vruntime(cfs_rq);
4404}
4405
4406
4407
4408
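/*
 * Preempt the current task with a newly woken task if needed:
 */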
4409static void
4410check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4411{
4412 unsigned long ideal_runtime, delta_exec;
4413 struct sched_entity *se;
4414 s64 delta;
4415
4416 ideal_runtime = sched_slice(cfs_rq, curr);
4417 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
4418 if (delta_exec > ideal_runtime) {
4419 resched_curr(rq_of(cfs_rq));
4420
4421
4422
4423
4424 clear_buddies(cfs_rq, curr);
4425 return;
4426 }
4427
4428
4429
4430
4431
4432
4433 if (delta_exec < sysctl_sched_min_granularity)
4434 return;
4435
4436 se = __pick_first_entity(cfs_rq);
4437 delta = curr->vruntime - se->vruntime;
4438
4439 if (delta < 0)
4440 return;
4441
4442 if (delta > ideal_runtime)
4443 resched_curr(rq_of(cfs_rq));
4444}
4445
4446static void
4447set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
4448{
4449 clear_buddies(cfs_rq, se);
4450
4451
4452 if (se->on_rq) {
4453
4454
4455
4456
4457
4458 update_stats_wait_end(cfs_rq, se);
4459 __dequeue_entity(cfs_rq, se);
4460 update_load_avg(cfs_rq, se, UPDATE_TG);
4461 }
4462
4463 update_stats_curr_start(cfs_rq, se);
4464 cfs_rq->curr = se;
4465
4466
4467
4468
4469
4470
4471 if (schedstat_enabled() &&
4472 rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
4473 schedstat_set(se->statistics.slice_max,
4474 max((u64)schedstat_val(se->statistics.slice_max),
4475 se->sum_exec_runtime - se->prev_sum_exec_runtime));
4476 }
4477
4478 se->prev_sum_exec_runtime = se->sum_exec_runtime;
4479}
4480
4481static int
4482wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
4483
4484
4485
4486
4487
4488
4489
4490
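/*
 * Pick the next process, keeping these things in mind, in this order:
 * 1) keep things fair between processes/task groups
 * 2) pick the "next" process, since someone really wants that to run
 * 3) pick the "last" process, for cache locality
 * 4) do not run the "skip" process, if something else is available
 */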
4491static struct sched_entity *
4492pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4493{
4494 struct sched_entity *left = __pick_first_entity(cfs_rq);
4495 struct sched_entity *se;
4496
4497
4498
4499
4500
4501 if (!left || (curr && entity_before(curr, left)))
4502 left = curr;
4503
4504 se = left;
4505
4506
4507
4508
4509
4510 if (cfs_rq->skip && cfs_rq->skip == se) {
4511 struct sched_entity *second;
4512
4513 if (se == curr) {
4514 second = __pick_first_entity(cfs_rq);
4515 } else {
4516 second = __pick_next_entity(se);
4517 if (!second || (curr && entity_before(curr, second)))
4518 second = curr;
4519 }
4520
4521 if (second && wakeup_preempt_entity(second, left) < 1)
4522 se = second;
4523 }
4524
4525 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
4526
4527
4528
4529 se = cfs_rq->next;
4530 } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
4531
4532
4533
4534 se = cfs_rq->last;
4535 }
4536
4537 return se;
4538}
4539
4540static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4541
4542static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
4543{
4544
4545
4546
4547
4548 if (prev->on_rq)
4549 update_curr(cfs_rq);
4550
4551
4552 check_cfs_rq_runtime(cfs_rq);
4553
4554 check_spread(cfs_rq, prev);
4555
4556 if (prev->on_rq) {
4557 update_stats_wait_start(cfs_rq, prev);
4558
4559 __enqueue_entity(cfs_rq, prev);
4560
4561 update_load_avg(cfs_rq, prev, 0);
4562 }
4563 cfs_rq->curr = NULL;
4564}
4565
4566static void
4567entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
4568{
4569
4570
4571
4572 update_curr(cfs_rq);
4573
4574
4575
4576
4577 update_load_avg(cfs_rq, curr, UPDATE_TG);
4578 update_cfs_group(curr);
4579
4580#ifdef CONFIG_SCHED_HRTICK
4581
4582
4583
4584
4585 if (queued) {
4586 resched_curr(rq_of(cfs_rq));
4587 return;
4588 }
4589
4590
4591
4592 if (!sched_feat(DOUBLE_TICK) &&
4593 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
4594 return;
4595#endif
4596
4597 if (cfs_rq->nr_running > 1)
4598 check_preempt_tick(cfs_rq, curr);
4599}
4600
4601
4602
4603
4604
4605
4606#ifdef CONFIG_CFS_BANDWIDTH
4607
4608#ifdef CONFIG_JUMP_LABEL
4609static struct static_key __cfs_bandwidth_used;
4610
4611static inline bool cfs_bandwidth_used(void)
4612{
4613 return static_key_false(&__cfs_bandwidth_used);
4614}
4615
4616void cfs_bandwidth_usage_inc(void)
4617{
4618 static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
4619}
4620
4621void cfs_bandwidth_usage_dec(void)
4622{
4623 static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
4624}
4625#else
4626static bool cfs_bandwidth_used(void)
4627{
4628 return true;
4629}
4630
4631void cfs_bandwidth_usage_inc(void) {}
4632void cfs_bandwidth_usage_dec(void) {}
4633#endif
4634
4635
4636
4637
4638
4639static inline u64 default_cfs_period(void)
4640{
4641 return 100000000ULL;
4642}
4643
4644static inline u64 sched_cfs_bandwidth_slice(void)
4645{
4646 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
4647}
4648
4649
4650
4651
4652
4653
4654
4655
4656void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
4657{
4658 if (unlikely(cfs_b->quota == RUNTIME_INF))
4659 return;
4660
4661 cfs_b->runtime += cfs_b->quota;
4662 cfs_b->runtime = min(cfs_b->runtime, cfs_b->quota + cfs_b->burst);
4663}
4664
4665static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4666{
4667 return &tg->cfs_bandwidth;
4668}
4669
4670
4671static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
4672 struct cfs_rq *cfs_rq, u64 target_runtime)
4673{
4674 u64 min_amount, amount = 0;
4675
4676 lockdep_assert_held(&cfs_b->lock);
4677
4678
4679 min_amount = target_runtime - cfs_rq->runtime_remaining;
4680
4681 if (cfs_b->quota == RUNTIME_INF)
4682 amount = min_amount;
4683 else {
4684 start_cfs_bandwidth(cfs_b);
4685
4686 if (cfs_b->runtime > 0) {
4687 amount = min(cfs_b->runtime, min_amount);
4688 cfs_b->runtime -= amount;
4689 cfs_b->idle = 0;
4690 }
4691 }
4692
4693 cfs_rq->runtime_remaining += amount;
4694
4695 return cfs_rq->runtime_remaining > 0;
4696}
4697
4698
4699static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4700{
4701 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4702 int ret;
4703
4704 raw_spin_lock(&cfs_b->lock);
4705 ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
4706 raw_spin_unlock(&cfs_b->lock);
4707
4708 return ret;
4709}
4710
4711static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4712{
4713
4714 cfs_rq->runtime_remaining -= delta_exec;
4715
4716 if (likely(cfs_rq->runtime_remaining > 0))
4717 return;
4718
4719 if (cfs_rq->throttled)
4720 return;
4721
4722
4723
4724
4725 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
4726 resched_curr(rq_of(cfs_rq));
4727}
4728
4729static __always_inline
4730void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4731{
4732 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
4733 return;
4734
4735 __account_cfs_rq_runtime(cfs_rq, delta_exec);
4736}
4737
4738static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4739{
4740 return cfs_bandwidth_used() && cfs_rq->throttled;
4741}
4742
4743
4744static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4745{
4746 return cfs_bandwidth_used() && cfs_rq->throttle_count;
4747}
4748
4749
4750
4751
4752
4753
4754static inline int throttled_lb_pair(struct task_group *tg,
4755 int src_cpu, int dest_cpu)
4756{
4757 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
4758
4759 src_cfs_rq = tg->cfs_rq[src_cpu];
4760 dest_cfs_rq = tg->cfs_rq[dest_cpu];
4761
4762 return throttled_hierarchy(src_cfs_rq) ||
4763 throttled_hierarchy(dest_cfs_rq);
4764}
4765
4766static int tg_unthrottle_up(struct task_group *tg, void *data)
4767{
4768 struct rq *rq = data;
4769 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4770
4771 cfs_rq->throttle_count--;
4772 if (!cfs_rq->throttle_count) {
4773 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
4774 cfs_rq->throttled_clock_task;
4775
4776
4777 if (!cfs_rq_is_decayed(cfs_rq) || cfs_rq->nr_running)
4778 list_add_leaf_cfs_rq(cfs_rq);
4779 }
4780
4781 return 0;
4782}
4783
4784static int tg_throttle_down(struct task_group *tg, void *data)
4785{
4786 struct rq *rq = data;
4787 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4788
4789
4790 if (!cfs_rq->throttle_count) {
4791 cfs_rq->throttled_clock_task = rq_clock_task(rq);
4792 list_del_leaf_cfs_rq(cfs_rq);
4793 }
4794 cfs_rq->throttle_count++;
4795
4796 return 0;
4797}
4798
4799static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
4800{
4801 struct rq *rq = rq_of(cfs_rq);
4802 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4803 struct sched_entity *se;
4804 long task_delta, idle_task_delta, dequeue = 1;
4805
4806 raw_spin_lock(&cfs_b->lock);
4807
4808 if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
4809
4810
4811
4812
4813
4814
4815
4816
4817 dequeue = 0;
4818 } else {
4819 list_add_tail_rcu(&cfs_rq->throttled_list,
4820 &cfs_b->throttled_cfs_rq);
4821 }
4822 raw_spin_unlock(&cfs_b->lock);
4823
4824 if (!dequeue)
4825 return false;
4826
4827 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
4828
4829
4830 rcu_read_lock();
4831 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
4832 rcu_read_unlock();
4833
4834 task_delta = cfs_rq->h_nr_running;
4835 idle_task_delta = cfs_rq->idle_h_nr_running;
4836 for_each_sched_entity(se) {
4837 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
4838
4839 if (!se->on_rq)
4840 goto done;
4841
4842 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
4843
4844 qcfs_rq->h_nr_running -= task_delta;
4845 qcfs_rq->idle_h_nr_running -= idle_task_delta;
4846
4847 if (qcfs_rq->load.weight) {
4848
4849 se = parent_entity(se);
4850 break;
4851 }
4852 }
4853
4854 for_each_sched_entity(se) {
4855 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
4856
4857 if (!se->on_rq)
4858 goto done;
4859
4860 update_load_avg(qcfs_rq, se, 0);
4861 se_update_runnable(se);
4862
4863 qcfs_rq->h_nr_running -= task_delta;
4864 qcfs_rq->idle_h_nr_running -= idle_task_delta;
4865 }
4866
4867
4868 sub_nr_running(rq, task_delta);
4869
4870done:
4871
4872
4873
4874
4875 cfs_rq->throttled = 1;
4876 cfs_rq->throttled_clock = rq_clock(rq);
4877 return true;
4878}
4879
4880void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
4881{
4882 struct rq *rq = rq_of(cfs_rq);
4883 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4884 struct sched_entity *se;
4885 long task_delta, idle_task_delta;
4886
4887 se = cfs_rq->tg->se[cpu_of(rq)];
4888
4889 cfs_rq->throttled = 0;
4890
4891 update_rq_clock(rq);
4892
4893 raw_spin_lock(&cfs_b->lock);
4894 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
4895 list_del_rcu(&cfs_rq->throttled_list);
4896 raw_spin_unlock(&cfs_b->lock);
4897
4898
4899 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
4900
4901 if (!cfs_rq->load.weight)
4902 return;
4903
4904 task_delta = cfs_rq->h_nr_running;
4905 idle_task_delta = cfs_rq->idle_h_nr_running;
4906 for_each_sched_entity(se) {
4907 if (se->on_rq)
4908 break;
4909 cfs_rq = cfs_rq_of(se);
4910 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
4911
4912 cfs_rq->h_nr_running += task_delta;
4913 cfs_rq->idle_h_nr_running += idle_task_delta;
4914
4915
4916 if (cfs_rq_throttled(cfs_rq))
4917 goto unthrottle_throttle;
4918 }
4919
4920 for_each_sched_entity(se) {
4921 cfs_rq = cfs_rq_of(se);
4922
4923 update_load_avg(cfs_rq, se, UPDATE_TG);
4924 se_update_runnable(se);
4925
4926 cfs_rq->h_nr_running += task_delta;
4927 cfs_rq->idle_h_nr_running += idle_task_delta;
4928
4929
4930
4931 if (cfs_rq_throttled(cfs_rq))
4932 goto unthrottle_throttle;
4933
4934
4935
4936
4937
4938 if (throttled_hierarchy(cfs_rq))
4939 list_add_leaf_cfs_rq(cfs_rq);
4940 }
4941
4942
4943 add_nr_running(rq, task_delta);
4944
4945unthrottle_throttle:
4946
4947
4948
4949
4950
4951 for_each_sched_entity(se) {
4952 cfs_rq = cfs_rq_of(se);
4953
4954 if (list_add_leaf_cfs_rq(cfs_rq))
4955 break;
4956 }
4957
4958 assert_list_leaf_cfs_rq(rq);
4959
4960
4961 if (rq->curr == rq->idle && rq->cfs.nr_running)
4962 resched_curr(rq);
4963}
4964
4965static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
4966{
4967 struct cfs_rq *cfs_rq;
4968 u64 runtime, remaining = 1;
4969
4970 rcu_read_lock();
4971 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
4972 throttled_list) {
4973 struct rq *rq = rq_of(cfs_rq);
4974 struct rq_flags rf;
4975
4976 rq_lock_irqsave(rq, &rf);
4977 if (!cfs_rq_throttled(cfs_rq))
4978 goto next;
4979
4980
4981 SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
4982
4983 raw_spin_lock(&cfs_b->lock);
4984 runtime = -cfs_rq->runtime_remaining + 1;
4985 if (runtime > cfs_b->runtime)
4986 runtime = cfs_b->runtime;
4987 cfs_b->runtime -= runtime;
4988 remaining = cfs_b->runtime;
4989 raw_spin_unlock(&cfs_b->lock);
4990
4991 cfs_rq->runtime_remaining += runtime;
4992
4993
4994 if (cfs_rq->runtime_remaining > 0)
4995 unthrottle_cfs_rq(cfs_rq);
4996
4997next:
4998 rq_unlock_irqrestore(rq, &rf);
4999
5000 if (!remaining)
5001 break;
5002 }
5003 rcu_read_unlock();
5004}
5005
5006
5007
5008
5009
5010
5011
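/*
 * Responsible for refilling a task_group's bandwidth and unthrottling its
 * cfs_rqs as appropriate. If there has been no activity within the last
 * period the timer is deactivated until scheduling resumes. Returns 1 when
 * the period timer can be deactivated.
 */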
5012static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
5013{
5014 int throttled;
5015
5016
5017 if (cfs_b->quota == RUNTIME_INF)
5018 goto out_deactivate;
5019
5020 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
5021 cfs_b->nr_periods += overrun;
5022
5023
5024 __refill_cfs_bandwidth_runtime(cfs_b);
5025
5026
5027
5028
5029
5030 if (cfs_b->idle && !throttled)
5031 goto out_deactivate;
5032
5033 if (!throttled) {
5034
5035 cfs_b->idle = 1;
5036 return 0;
5037 }
5038
5039
5040 cfs_b->nr_throttled += overrun;
5041
5042
5043
5044
5045 while (throttled && cfs_b->runtime > 0) {
5046 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
5047
5048 distribute_cfs_runtime(cfs_b);
5049 raw_spin_lock_irqsave(&cfs_b->lock, flags);
5050
5051 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
5052 }
5053
5054
5055
5056
5057
5058
5059
5060 cfs_b->idle = 0;
5061
5062 return 0;
5063
5064out_deactivate:
5065 return 1;
5066}
5067
5068
5069static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
5070
5071static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
5072
5073static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
5074
5075
5076
5077
5078
5079
5080
5081
5082static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
5083{
5084 struct hrtimer *refresh_timer = &cfs_b->period_timer;
5085 s64 remaining;
5086
5087
5088 if (hrtimer_callback_running(refresh_timer))
5089 return 1;
5090
5091
5092 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
5093 if (remaining < (s64)min_expire)
5094 return 1;
5095
5096 return 0;
5097}
5098
5099static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
5100{
5101 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
5102
5103
5104 if (runtime_refresh_within(cfs_b, min_left))
5105 return;
5106
5107
5108 if (cfs_b->slack_started)
5109 return;
5110 cfs_b->slack_started = true;
5111
5112 hrtimer_start(&cfs_b->slack_timer,
5113 ns_to_ktime(cfs_bandwidth_slack_period),
5114 HRTIMER_MODE_REL);
5115}
5116
5117
5118static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5119{
5120 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5121 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
5122
5123 if (slack_runtime <= 0)
5124 return;
5125
5126 raw_spin_lock(&cfs_b->lock);
5127 if (cfs_b->quota != RUNTIME_INF) {
5128 cfs_b->runtime += slack_runtime;
5129
5130
5131 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
5132 !list_empty(&cfs_b->throttled_cfs_rq))
5133 start_cfs_slack_bandwidth(cfs_b);
5134 }
5135 raw_spin_unlock(&cfs_b->lock);
5136
5137
5138 cfs_rq->runtime_remaining -= slack_runtime;
5139}
5140
5141static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5142{
5143 if (!cfs_bandwidth_used())
5144 return;
5145
5146 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
5147 return;
5148
5149 __return_cfs_rq_runtime(cfs_rq);
5150}
5151
5152
5153
5154
5155
5156static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
5157{
5158 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
5159 unsigned long flags;
5160
5161
5162 raw_spin_lock_irqsave(&cfs_b->lock, flags);
5163 cfs_b->slack_started = false;
5164
5165 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
5166 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
5167 return;
5168 }
5169
5170 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
5171 runtime = cfs_b->runtime;
5172
5173 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
5174
5175 if (!runtime)
5176 return;
5177
5178 distribute_cfs_runtime(cfs_b);
5179}
5180
5181
5182
5183
5184
5185
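/*
 * When a cfs_rq becomes runnable again, check whether it still has quota
 * left and, if not, throttle it immediately rather than letting it run
 * until the next accounting tick.
 */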
5186static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
5187{
5188 if (!cfs_bandwidth_used())
5189 return;
5190
5191
5192 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
5193 return;
5194
5195
5196 if (cfs_rq_throttled(cfs_rq))
5197 return;
5198
5199
5200 account_cfs_rq_runtime(cfs_rq, 0);
5201 if (cfs_rq->runtime_remaining <= 0)
5202 throttle_cfs_rq(cfs_rq);
5203}
5204
5205static void sync_throttle(struct task_group *tg, int cpu)
5206{
5207 struct cfs_rq *pcfs_rq, *cfs_rq;
5208
5209 if (!cfs_bandwidth_used())
5210 return;
5211
5212 if (!tg->parent)
5213 return;
5214
5215 cfs_rq = tg->cfs_rq[cpu];
5216 pcfs_rq = tg->parent->cfs_rq[cpu];
5217
5218 cfs_rq->throttle_count = pcfs_rq->throttle_count;
5219 cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
5220}
5221
5222
5223static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5224{
5225 if (!cfs_bandwidth_used())
5226 return false;
5227
5228 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
5229 return false;
5230
5231
5232
5233
5234
5235 if (cfs_rq_throttled(cfs_rq))
5236 return true;
5237
5238 return throttle_cfs_rq(cfs_rq);
5239}
5240
5241static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
5242{
5243 struct cfs_bandwidth *cfs_b =
5244 container_of(timer, struct cfs_bandwidth, slack_timer);
5245
5246 do_sched_cfs_slack_timer(cfs_b);
5247
5248 return HRTIMER_NORESTART;
5249}
5250
5251extern const u64 max_cfs_quota_period;
5252
5253static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
5254{
5255 struct cfs_bandwidth *cfs_b =
5256 container_of(timer, struct cfs_bandwidth, period_timer);
5257 unsigned long flags;
5258 int overrun;
5259 int idle = 0;
5260 int count = 0;
5261
5262 raw_spin_lock_irqsave(&cfs_b->lock, flags);
5263 for (;;) {
5264 overrun = hrtimer_forward_now(timer, cfs_b->period);
5265 if (!overrun)
5266 break;
5267
5268 idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
5269
5270 if (++count > 3) {
5271 u64 new, old = ktime_to_ns(cfs_b->period);
5272
5273
5274
5275
5276
5277
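			/*
			 * Grow period by a factor of 2 to avoid losing
			 * precision. Precision loss in the quota/period
			 * ratio can cause __cfs_schedulable() to fail.
			 */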
5278 new = old * 2;
5279 if (new < max_cfs_quota_period) {
5280 cfs_b->period = ns_to_ktime(new);
5281 cfs_b->quota *= 2;
5282 cfs_b->burst *= 2;
5283
5284 pr_warn_ratelimited(
5285 "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
5286 smp_processor_id(),
5287 div_u64(new, NSEC_PER_USEC),
5288 div_u64(cfs_b->quota, NSEC_PER_USEC));
5289 } else {
5290 pr_warn_ratelimited(
5291 "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
5292 smp_processor_id(),
5293 div_u64(old, NSEC_PER_USEC),
5294 div_u64(cfs_b->quota, NSEC_PER_USEC));
5295 }
5296
5297
5298 count = 0;
5299 }
5300 }
5301 if (idle)
5302 cfs_b->period_active = 0;
5303 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
5304
5305 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
5306}
5307
5308void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5309{
5310 raw_spin_lock_init(&cfs_b->lock);
5311 cfs_b->runtime = 0;
5312 cfs_b->quota = RUNTIME_INF;
5313 cfs_b->period = ns_to_ktime(default_cfs_period());
5314 cfs_b->burst = 0;
5315
5316 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
5317 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
5318 cfs_b->period_timer.function = sched_cfs_period_timer;
5319 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5320 cfs_b->slack_timer.function = sched_cfs_slack_timer;
5321 cfs_b->slack_started = false;
5322}
5323
5324static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5325{
5326 cfs_rq->runtime_enabled = 0;
5327 INIT_LIST_HEAD(&cfs_rq->throttled_list);
5328}
5329
5330void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5331{
5332 lockdep_assert_held(&cfs_b->lock);
5333
5334 if (cfs_b->period_active)
5335 return;
5336
5337 cfs_b->period_active = 1;
5338 hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
5339 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
5340}
5341
5342static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5343{
5344
5345 if (!cfs_b->throttled_cfs_rq.next)
5346 return;
5347
5348 hrtimer_cancel(&cfs_b->period_timer);
5349 hrtimer_cancel(&cfs_b->slack_timer);
5350}
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360static void __maybe_unused update_runtime_enabled(struct rq *rq)
5361{
5362 struct task_group *tg;
5363
5364 lockdep_assert_rq_held(rq);
5365
5366 rcu_read_lock();
5367 list_for_each_entry_rcu(tg, &task_groups, list) {
5368 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
5369 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5370
5371 raw_spin_lock(&cfs_b->lock);
5372 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
5373 raw_spin_unlock(&cfs_b->lock);
5374 }
5375 rcu_read_unlock();
5376}
5377
5378
5379static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
5380{
5381 struct task_group *tg;
5382
5383 lockdep_assert_rq_held(rq);
5384
5385 rcu_read_lock();
5386 list_for_each_entry_rcu(tg, &task_groups, list) {
5387 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5388
5389 if (!cfs_rq->runtime_enabled)
5390 continue;
5391
5392
5393
5394
5395
5396 cfs_rq->runtime_remaining = 1;
5397
5398
5399
5400
5401 cfs_rq->runtime_enabled = 0;
5402
5403 if (cfs_rq_throttled(cfs_rq))
5404 unthrottle_cfs_rq(cfs_rq);
5405 }
5406 rcu_read_unlock();
5407}
5408
5409#else
5410
5411static inline bool cfs_bandwidth_used(void)
5412{
5413 return false;
5414}
5415
5416static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
5417static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
5418static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
5419static inline void sync_throttle(struct task_group *tg, int cpu) {}
5420static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
5421
5422static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
5423{
5424 return 0;
5425}
5426
5427static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
5428{
5429 return 0;
5430}
5431
5432static inline int throttled_lb_pair(struct task_group *tg,
5433 int src_cpu, int dest_cpu)
5434{
5435 return 0;
5436}
5437
5438void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5439
5440#ifdef CONFIG_FAIR_GROUP_SCHED
5441static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
5442#endif
5443
5444static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
5445{
5446 return NULL;
5447}
5448static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5449static inline void update_runtime_enabled(struct rq *rq) {}
5450static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
5451
5452#endif
5453
5454
5455
5456
5457
5458#ifdef CONFIG_SCHED_HRTICK
5459static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
5460{
5461 struct sched_entity *se = &p->se;
5462 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5463
5464 SCHED_WARN_ON(task_rq(p) != rq);
5465
5466 if (rq->cfs.h_nr_running > 1) {
5467 u64 slice = sched_slice(cfs_rq, se);
5468 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
5469 s64 delta = slice - ran;
5470
5471 if (delta < 0) {
5472 if (task_current(rq, p))
5473 resched_curr(rq);
5474 return;
5475 }
5476 hrtick_start(rq, delta);
5477 }
5478}
5479
5480
5481
5482
5483
5484
5485static void hrtick_update(struct rq *rq)
5486{
5487 struct task_struct *curr = rq->curr;
5488
5489 if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class)
5490 return;
5491
5492 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
5493 hrtick_start_fair(rq, curr);
5494}
5495#else
5496static inline void
5497hrtick_start_fair(struct rq *rq, struct task_struct *p)
5498{
5499}
5500
5501static inline void hrtick_update(struct rq *rq)
5502{
5503}
5504#endif
5505
5506#ifdef CONFIG_SMP
5507static inline unsigned long cpu_util(int cpu);
5508
5509static inline bool cpu_overutilized(int cpu)
5510{
5511 return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
5512}
5513
5514static inline void update_overutilized_status(struct rq *rq)
5515{
5516 if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
5517 WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
5518 trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
5519 }
5520}
5521#else
5522static inline void update_overutilized_status(struct rq *rq) { }
5523#endif
5524
5525
5526static int sched_idle_rq(struct rq *rq)
5527{
5528 return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
5529 rq->nr_running);
5530}
5531
5532#ifdef CONFIG_SMP
5533static int sched_idle_cpu(int cpu)
5534{
5535 return sched_idle_rq(cpu_rq(cpu));
5536}
5537#endif
5538
5539
5540
5541
5542
5543
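/*
 * The enqueue_task method is called before nr_running is
 * increased. Here we update the fair scheduling stats and
 * then put the task into the rbtree:
 */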
5544static void
5545enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5546{
5547 struct cfs_rq *cfs_rq;
5548 struct sched_entity *se = &p->se;
5549 int idle_h_nr_running = task_has_idle_policy(p);
5550 int task_new = !(flags & ENQUEUE_WAKEUP);
5551
5552
5553
5554
5555
5556
5557
5558 util_est_enqueue(&rq->cfs, p);
5559
5560
5561
5562
5563
5564
5565 if (p->in_iowait)
5566 cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
5567
5568 for_each_sched_entity(se) {
5569 if (se->on_rq)
5570 break;
5571 cfs_rq = cfs_rq_of(se);
5572 enqueue_entity(cfs_rq, se, flags);
5573
5574 cfs_rq->h_nr_running++;
5575 cfs_rq->idle_h_nr_running += idle_h_nr_running;
5576
5577
5578 if (cfs_rq_throttled(cfs_rq))
5579 goto enqueue_throttle;
5580
5581 flags = ENQUEUE_WAKEUP;
5582 }
5583
5584 for_each_sched_entity(se) {
5585 cfs_rq = cfs_rq_of(se);
5586
5587 update_load_avg(cfs_rq, se, UPDATE_TG);
5588 se_update_runnable(se);
5589 update_cfs_group(se);
5590
5591 cfs_rq->h_nr_running++;
5592 cfs_rq->idle_h_nr_running += idle_h_nr_running;
5593
5594
5595 if (cfs_rq_throttled(cfs_rq))
5596 goto enqueue_throttle;
5597
5598
5599
5600
5601
5602 if (throttled_hierarchy(cfs_rq))
5603 list_add_leaf_cfs_rq(cfs_rq);
5604 }
5605
5606
5607 add_nr_running(rq, 1);
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623 if (!task_new)
5624 update_overutilized_status(rq);
5625
5626enqueue_throttle:
5627 if (cfs_bandwidth_used()) {
5628
5629
5630
5631
5632
5633
5634 for_each_sched_entity(se) {
5635 cfs_rq = cfs_rq_of(se);
5636
5637 if (list_add_leaf_cfs_rq(cfs_rq))
5638 break;
5639 }
5640 }
5641
5642 assert_list_leaf_cfs_rq(rq);
5643
5644 hrtick_update(rq);
5645}
5646
5647static void set_next_buddy(struct sched_entity *se);
5648
5649
5650
5651
5652
5653
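/*
 * Dequeue @p and walk up the hierarchy.  The walk stops at the first
 * ancestor that still has weight (it stays runnable for its remaining
 * children); on a voluntary sleep that parent entity is marked as the
 * next buddy, biasing the next pick towards the sleeping task's group.
 */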
5654static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5655{
5656 struct cfs_rq *cfs_rq;
5657 struct sched_entity *se = &p->se;
5658 int task_sleep = flags & DEQUEUE_SLEEP;
5659 int idle_h_nr_running = task_has_idle_policy(p);
5660 bool was_sched_idle = sched_idle_rq(rq);
5661
5662 util_est_dequeue(&rq->cfs, p);
5663
5664 for_each_sched_entity(se) {
5665 cfs_rq = cfs_rq_of(se);
5666 dequeue_entity(cfs_rq, se, flags);
5667
5668 cfs_rq->h_nr_running--;
5669 cfs_rq->idle_h_nr_running -= idle_h_nr_running;
5670
5671
5672 if (cfs_rq_throttled(cfs_rq))
5673 goto dequeue_throttle;
5674
5675
5676 if (cfs_rq->load.weight) {
5677
5678 se = parent_entity(se);
5679
5680
5681
5682
5683 if (task_sleep && se && !throttled_hierarchy(cfs_rq))
5684 set_next_buddy(se);
5685 break;
5686 }
5687 flags |= DEQUEUE_SLEEP;
5688 }
5689
5690 for_each_sched_entity(se) {
5691 cfs_rq = cfs_rq_of(se);
5692
5693 update_load_avg(cfs_rq, se, UPDATE_TG);
5694 se_update_runnable(se);
5695 update_cfs_group(se);
5696
5697 cfs_rq->h_nr_running--;
5698 cfs_rq->idle_h_nr_running -= idle_h_nr_running;
5699
5700
5701 if (cfs_rq_throttled(cfs_rq))
5702 goto dequeue_throttle;
5703
5704 }
5705
5706
5707 sub_nr_running(rq, 1);
5708
5709
5710 if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
5711 rq->next_balance = jiffies;
5712
5713dequeue_throttle:
5714 util_est_update(&rq->cfs, p, task_sleep);
5715 hrtick_update(rq);
5716}
5717
5718#ifdef CONFIG_SMP
5719
5720
5721DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
5722DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
5723
5724#ifdef CONFIG_NO_HZ_COMMON
5725
5726static struct {
5727 cpumask_var_t idle_cpus_mask;
5728 atomic_t nr_cpus;
5729 int has_blocked;
5730 unsigned long next_balance;
5731 unsigned long next_blocked;
5732} nohz ____cacheline_aligned;
5733
5734#endif
5735
5736static unsigned long cpu_load(struct rq *rq)
5737{
5738 return cfs_rq_load_avg(&rq->cfs);
5739}
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
5755{
5756 struct cfs_rq *cfs_rq;
5757 unsigned int load;
5758
5759
5760 if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
5761 return cpu_load(rq);
5762
5763 cfs_rq = &rq->cfs;
5764 load = READ_ONCE(cfs_rq->avg.load_avg);
5765
5766
5767 lsub_positive(&load, task_h_load(p));
5768
5769 return load;
5770}
5771
5772static unsigned long cpu_runnable(struct rq *rq)
5773{
5774 return cfs_rq_runnable_avg(&rq->cfs);
5775}
5776
5777static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p)
5778{
5779 struct cfs_rq *cfs_rq;
5780 unsigned int runnable;
5781
5782
5783 if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
5784 return cpu_runnable(rq);
5785
5786 cfs_rq = &rq->cfs;
5787 runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
5788
5789
5790 lsub_positive(&runnable, p->se.avg.runnable_avg);
5791
5792 return runnable;
5793}
5794
5795static unsigned long capacity_of(int cpu)
5796{
5797 return cpu_rq(cpu)->cpu_capacity;
5798}
5799
5800static void record_wakee(struct task_struct *p)
5801{
5802
5803
5804
5805
5806 if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
5807 current->wakee_flips >>= 1;
5808 current->wakee_flip_decay_ts = jiffies;
5809 }
5810
5811 if (current->last_wakee != p) {
5812 current->last_wakee = p;
5813 current->wakee_flips++;
5814 }
5815}
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
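/*
 * Heuristic to detect 1:N waker/wakee relationships.  wakee_flips counts
 * how often a task switches wakeup partners, halved roughly once a second.
 * If the smaller flip count of the pair reaches the LLC size and the larger
 * one exceeds it by another factor of the LLC size, the relationship is
 * considered too "wide" for an affine wakeup, so no pull towards the
 * waker's CPU is attempted.
 */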
5834static int wake_wide(struct task_struct *p)
5835{
5836 unsigned int master = current->wakee_flips;
5837 unsigned int slave = p->wakee_flips;
5838 int factor = __this_cpu_read(sd_llc_size);
5839
5840 if (master < slave)
5841 swap(master, slave);
5842 if (slave < factor || master < slave * factor)
5843 return 0;
5844 return 1;
5845}
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
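/*
 * Affine-wakeup fast path: if the waking CPU is cache-affine with prev and
 * idle, use it (but keep prev if prev is idle as well); a sync wakeup with
 * only the waker running also qualifies, since the waker is about to sleep.
 * Otherwise fall back to an idle prev.  Returning nr_cpumask_bits means
 * "no opinion" and lets wake_affine_weight() decide.
 */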
5859static int
5860wake_affine_idle(int this_cpu, int prev_cpu, int sync)
5861{
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874 if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
5875 return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
5876
5877 if (sync && cpu_rq(this_cpu)->nr_running == 1)
5878 return this_cpu;
5879
5880 if (available_idle_cpu(prev_cpu))
5881 return prev_cpu;
5882
5883 return nr_cpumask_bits;
5884}
5885
5886static int
5887wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5888 int this_cpu, int prev_cpu, int sync)
5889{
5890 s64 this_eff_load, prev_eff_load;
5891 unsigned long task_load;
5892
5893 this_eff_load = cpu_load(cpu_rq(this_cpu));
5894
5895 if (sync) {
5896 unsigned long current_load = task_h_load(current);
5897
5898 if (current_load > this_eff_load)
5899 return this_cpu;
5900
5901 this_eff_load -= current_load;
5902 }
5903
5904 task_load = task_h_load(p);
5905
5906 this_eff_load += task_load;
5907 if (sched_feat(WA_BIAS))
5908 this_eff_load *= 100;
5909 this_eff_load *= capacity_of(prev_cpu);
5910
5911 prev_eff_load = cpu_load(cpu_rq(prev_cpu));
5912 prev_eff_load -= task_load;
5913 if (sched_feat(WA_BIAS))
5914 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
5915 prev_eff_load *= capacity_of(this_cpu);
5916
5917
5918
5919
5920
5921
5922
5923 if (sync)
5924 prev_eff_load += 1;
5925
5926 return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
5927}
5928
5929static int wake_affine(struct sched_domain *sd, struct task_struct *p,
5930 int this_cpu, int prev_cpu, int sync)
5931{
5932 int target = nr_cpumask_bits;
5933
5934 if (sched_feat(WA_IDLE))
5935 target = wake_affine_idle(this_cpu, prev_cpu, sync);
5936
5937 if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
5938 target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
5939
5940 schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
5941 if (target == nr_cpumask_bits)
5942 return prev_cpu;
5943
5944 schedstat_inc(sd->ttwu_move_affine);
5945 schedstat_inc(p->se.statistics.nr_wakeups_affine);
5946 return target;
5947}
5948
5949static struct sched_group *
5950find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
5951
5952
5953
5954
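/*
 * Pick the best CPU within @group for @p: a SCHED_IDLE-only CPU wins
 * immediately, then the idle CPU with the shallowest C-state exit latency
 * (ties broken by the most recently idled CPU), and finally the least
 * loaded non-idle CPU.
 */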
5955static int
5956find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
5957{
5958 unsigned long load, min_load = ULONG_MAX;
5959 unsigned int min_exit_latency = UINT_MAX;
5960 u64 latest_idle_timestamp = 0;
5961 int least_loaded_cpu = this_cpu;
5962 int shallowest_idle_cpu = -1;
5963 int i;
5964
5965
5966 if (group->group_weight == 1)
5967 return cpumask_first(sched_group_span(group));
5968
5969
5970 for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
5971 struct rq *rq = cpu_rq(i);
5972
5973 if (!sched_core_cookie_match(rq, p))
5974 continue;
5975
5976 if (sched_idle_cpu(i))
5977 return i;
5978
5979 if (available_idle_cpu(i)) {
5980 struct cpuidle_state *idle = idle_get_state(rq);
5981 if (idle && idle->exit_latency < min_exit_latency) {
5982
5983
5984
5985
5986
5987 min_exit_latency = idle->exit_latency;
5988 latest_idle_timestamp = rq->idle_stamp;
5989 shallowest_idle_cpu = i;
5990 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
5991 rq->idle_stamp > latest_idle_timestamp) {
5992
5993
5994
5995
5996
5997 latest_idle_timestamp = rq->idle_stamp;
5998 shallowest_idle_cpu = i;
5999 }
6000 } else if (shallowest_idle_cpu == -1) {
6001 load = cpu_load(cpu_rq(i));
6002 if (load < min_load) {
6003 min_load = load;
6004 least_loaded_cpu = i;
6005 }
6006 }
6007 }
6008
6009 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
6010}
6011
6012static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
6013 int cpu, int prev_cpu, int sd_flag)
6014{
6015 int new_cpu = cpu;
6016
6017 if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
6018 return prev_cpu;
6019
6020
6021
6022
6023
6024 if (!(sd_flag & SD_BALANCE_FORK))
6025 sync_entity_load_avg(&p->se);
6026
6027 while (sd) {
6028 struct sched_group *group;
6029 struct sched_domain *tmp;
6030 int weight;
6031
6032 if (!(sd->flags & sd_flag)) {
6033 sd = sd->child;
6034 continue;
6035 }
6036
6037 group = find_idlest_group(sd, p, cpu);
6038 if (!group) {
6039 sd = sd->child;
6040 continue;
6041 }
6042
6043 new_cpu = find_idlest_group_cpu(group, p, cpu);
6044 if (new_cpu == cpu) {
6045
6046 sd = sd->child;
6047 continue;
6048 }
6049
6050
6051 cpu = new_cpu;
6052 weight = sd->span_weight;
6053 sd = NULL;
6054 for_each_domain(cpu, tmp) {
6055 if (weight <= tmp->span_weight)
6056 break;
6057 if (tmp->flags & sd_flag)
6058 sd = tmp;
6059 }
6060 }
6061
6062 return new_cpu;
6063}
6064
6065static inline int __select_idle_cpu(int cpu, struct task_struct *p)
6066{
6067 if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
6068 sched_cpu_cookie_match(cpu_rq(cpu), p))
6069 return cpu;
6070
6071 return -1;
6072}
6073
6074#ifdef CONFIG_SCHED_SMT
6075DEFINE_STATIC_KEY_FALSE(sched_smt_present);
6076EXPORT_SYMBOL_GPL(sched_smt_present);
6077
6078static inline void set_idle_cores(int cpu, int val)
6079{
6080 struct sched_domain_shared *sds;
6081
6082 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
6083 if (sds)
6084 WRITE_ONCE(sds->has_idle_cores, val);
6085}
6086
6087static inline bool test_idle_cores(int cpu, bool def)
6088{
6089 struct sched_domain_shared *sds;
6090
6091 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
6092 if (sds)
6093 return READ_ONCE(sds->has_idle_cores);
6094
6095 return def;
6096}
6097
6098
6099
6100
6101
6102
6103
6104
6105void __update_idle_core(struct rq *rq)
6106{
6107 int core = cpu_of(rq);
6108 int cpu;
6109
6110 rcu_read_lock();
6111 if (test_idle_cores(core, true))
6112 goto unlock;
6113
6114 for_each_cpu(cpu, cpu_smt_mask(core)) {
6115 if (cpu == core)
6116 continue;
6117
6118 if (!available_idle_cpu(cpu))
6119 goto unlock;
6120 }
6121
6122 set_idle_cores(core, 1);
6123unlock:
6124 rcu_read_unlock();
6125}
6126
6127
6128
6129
6130
6131
6132static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
6133{
6134 bool idle = true;
6135 int cpu;
6136
6137 if (!static_branch_likely(&sched_smt_present))
6138 return __select_idle_cpu(core, p);
6139
6140 for_each_cpu(cpu, cpu_smt_mask(core)) {
6141 if (!available_idle_cpu(cpu)) {
6142 idle = false;
6143 if (*idle_cpu == -1) {
6144 if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->cpus_ptr)) {
6145 *idle_cpu = cpu;
6146 break;
6147 }
6148 continue;
6149 }
6150 break;
6151 }
6152 if (*idle_cpu == -1 && cpumask_test_cpu(cpu, p->cpus_ptr))
6153 *idle_cpu = cpu;
6154 }
6155
6156 if (idle)
6157 return core;
6158
6159 cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
6160 return -1;
6161}
6162
6163
6164
6165
6166static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
6167{
6168 int cpu;
6169
6170 for_each_cpu(cpu, cpu_smt_mask(target)) {
6171 if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
6172 !cpumask_test_cpu(cpu, sched_domain_span(sd)))
6173 continue;
6174 if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
6175 return cpu;
6176 }
6177
6178 return -1;
6179}
6180
6181#else
6182
6183static inline void set_idle_cores(int cpu, int val)
6184{
6185}
6186
6187static inline bool test_idle_cores(int cpu, bool def)
6188{
6189 return def;
6190}
6191
6192static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
6193{
6194 return __select_idle_cpu(core, p);
6195}
6196
6197static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
6198{
6199 return -1;
6200}
6201
6202#endif
6203
6204
6205
6206
6207
6208
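/*
 * Scan the LLC domain for an idle CPU (or, when @has_idle_core, a fully
 * idle SMT core).  With SIS_PROP the scan depth is limited to roughly
 * span_weight * avg_idle / avg_scan_cost, but never below 4 CPUs; the time
 * spent scanning is fed back into both the wake_avg_idle and avg_scan_cost
 * estimates.
 */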
6209static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target)
6210{
6211 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
6212 int i, cpu, idle_cpu = -1, nr = INT_MAX;
6213 struct rq *this_rq = this_rq();
6214 int this = smp_processor_id();
6215 struct sched_domain *this_sd;
6216 u64 time = 0;
6217
6218 this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
6219 if (!this_sd)
6220 return -1;
6221
6222 cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
6223
6224 if (sched_feat(SIS_PROP) && !has_idle_core) {
6225 u64 avg_cost, avg_idle, span_avg;
6226 unsigned long now = jiffies;
6227
6228
6229
6230
6231
6232
6233 if (unlikely(this_rq->wake_stamp < now)) {
6234 while (this_rq->wake_stamp < now && this_rq->wake_avg_idle) {
6235 this_rq->wake_stamp++;
6236 this_rq->wake_avg_idle >>= 1;
6237 }
6238 }
6239
6240 avg_idle = this_rq->wake_avg_idle;
6241 avg_cost = this_sd->avg_scan_cost + 1;
6242
6243 span_avg = sd->span_weight * avg_idle;
6244 if (span_avg > 4*avg_cost)
6245 nr = div_u64(span_avg, avg_cost);
6246 else
6247 nr = 4;
6248
6249 time = cpu_clock(this);
6250 }
6251
6252 for_each_cpu_wrap(cpu, cpus, target) {
6253 if (has_idle_core) {
6254 i = select_idle_core(p, cpu, cpus, &idle_cpu);
6255 if ((unsigned int)i < nr_cpumask_bits)
6256 return i;
6257
6258 } else {
6259 if (!--nr)
6260 return -1;
6261 idle_cpu = __select_idle_cpu(cpu, p);
6262 if ((unsigned int)idle_cpu < nr_cpumask_bits)
6263 break;
6264 }
6265 }
6266
6267 if (has_idle_core)
6268 set_idle_cores(target, false);
6269
6270 if (sched_feat(SIS_PROP) && !has_idle_core) {
6271 time = cpu_clock(this) - time;
6272
6273
6274
6275
6276
6277 this_rq->wake_avg_idle -= min(this_rq->wake_avg_idle, time);
6278
6279 update_avg(&this_sd->avg_scan_cost, time);
6280 }
6281
6282 return idle_cpu;
6283}
6284
6285
6286
6287
6288
6289
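/*
 * Capacity-aware idle search for asymmetric-capacity systems: return the
 * first idle CPU whose capacity fits the task's clamped utilization, or
 * failing that the idle CPU with the largest capacity seen during the scan.
 */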
6290static int
6291select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
6292{
6293 unsigned long task_util, best_cap = 0;
6294 int cpu, best_cpu = -1;
6295 struct cpumask *cpus;
6296
6297 cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
6298 cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
6299
6300 task_util = uclamp_task_util(p);
6301
6302 for_each_cpu_wrap(cpu, cpus, target) {
6303 unsigned long cpu_cap = capacity_of(cpu);
6304
6305 if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
6306 continue;
6307 if (fits_capacity(task_util, cpu_cap))
6308 return cpu;
6309
6310 if (cpu_cap > best_cap) {
6311 best_cap = cpu_cap;
6312 best_cpu = cpu;
6313 }
6314 }
6315
6316 return best_cpu;
6317}
6318
6319static inline bool asym_fits_capacity(int task_util, int cpu)
6320{
6321 if (static_branch_unlikely(&sched_asym_cpucapacity))
6322 return fits_capacity(task_util, capacity_of(cpu));
6323
6324 return true;
6325}
6326
6327
6328
6329
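/*
 * Wake-up fast path: try @target, then @prev and the task's recent_used_cpu
 * when they share a cache with @target and are idle (and, on asymmetric
 * systems, big enough for the task), before falling back to a full LLC scan
 * via select_idle_capacity()/select_idle_cpu().
 */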
6330static int select_idle_sibling(struct task_struct *p, int prev, int target)
6331{
6332 bool has_idle_core = false;
6333 struct sched_domain *sd;
6334 unsigned long task_util;
6335 int i, recent_used_cpu;
6336
6337
6338
6339
6340
6341 if (static_branch_unlikely(&sched_asym_cpucapacity)) {
6342 sync_entity_load_avg(&p->se);
6343 task_util = uclamp_task_util(p);
6344 }
6345
6346
6347
6348
6349 lockdep_assert_irqs_disabled();
6350
6351 if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
6352 asym_fits_capacity(task_util, target))
6353 return target;
6354
6355
6356
6357
6358 if (prev != target && cpus_share_cache(prev, target) &&
6359 (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
6360 asym_fits_capacity(task_util, prev))
6361 return prev;
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371 if (is_per_cpu_kthread(current) &&
6372 prev == smp_processor_id() &&
6373 this_rq()->nr_running <= 1) {
6374 return prev;
6375 }
6376
6377
6378 recent_used_cpu = p->recent_used_cpu;
6379 if (recent_used_cpu != prev &&
6380 recent_used_cpu != target &&
6381 cpus_share_cache(recent_used_cpu, target) &&
6382 (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
6383 cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
6384 asym_fits_capacity(task_util, recent_used_cpu)) {
6385
6386
6387
6388
6389 p->recent_used_cpu = prev;
6390 return recent_used_cpu;
6391 }
6392
6393
6394
6395
6396
6397 if (static_branch_unlikely(&sched_asym_cpucapacity)) {
6398 sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
6399
6400
6401
6402
6403
6404
6405
6406
6407 if (sd) {
6408 i = select_idle_capacity(p, sd, target);
6409 return ((unsigned)i < nr_cpumask_bits) ? i : target;
6410 }
6411 }
6412
6413 sd = rcu_dereference(per_cpu(sd_llc, target));
6414 if (!sd)
6415 return target;
6416
6417 if (sched_smt_active()) {
6418 has_idle_core = test_idle_cores(target, false);
6419
6420 if (!has_idle_core && cpus_share_cache(prev, target)) {
6421 i = select_idle_smt(p, sd, prev);
6422 if ((unsigned int)i < nr_cpumask_bits)
6423 return i;
6424 }
6425 }
6426
6427 i = select_idle_cpu(p, sd, has_idle_core, target);
6428 if ((unsigned)i < nr_cpumask_bits)
6429 return i;
6430
6431 return target;
6432}
6433
6471
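/*
 * cpu_util() returns the CFS utilization of a CPU: the sum of the util_avg
 * of its runnable and recently blocked tasks, bumped up to the enqueued
 * util_est when UTIL_EST is enabled so wakeups are reflected immediately.
 * The result is clamped to capacity_orig_of(), since the raw sum can
 * transiently exceed the CPU's capacity (e.g. right after tasks migrate in).
 */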
6472static inline unsigned long cpu_util(int cpu)
6473{
6474 struct cfs_rq *cfs_rq;
6475 unsigned int util;
6476
6477 cfs_rq = &cpu_rq(cpu)->cfs;
6478 util = READ_ONCE(cfs_rq->avg.util_avg);
6479
6480 if (sched_feat(UTIL_EST))
6481 util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
6482
6483 return min_t(unsigned long, util, capacity_orig_of(cpu));
6484}
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
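/*
 * Like cpu_util(), but with @p's own contribution removed; used when
 * evaluating @cpu as a destination for @p itself.  The util_est part is
 * only discounted when @p is actually enqueued or running there, as the
 * estimate of a sleeping task has already been subtracted at dequeue.
 */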
6499static unsigned long cpu_util_without(int cpu, struct task_struct *p)
6500{
6501 struct cfs_rq *cfs_rq;
6502 unsigned int util;
6503
6504
6505 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
6506 return cpu_util(cpu);
6507
6508 cfs_rq = &cpu_rq(cpu)->cfs;
6509 util = READ_ONCE(cfs_rq->avg.util_avg);
6510
6511
6512 lsub_positive(&util, task_util(p));
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540 if (sched_feat(UTIL_EST)) {
6541 unsigned int estimated =
6542 READ_ONCE(cfs_rq->avg.util_est.enqueued);
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561 if (unlikely(task_on_rq_queued(p) || current == p))
6562 lsub_positive(&estimated, _task_util_est(p));
6563
6564 util = max(util, estimated);
6565 }
6566
6567
6568
6569
6570
6571
6572 return min_t(unsigned long, util, capacity_orig_of(cpu));
6573}
6574
6575
6576
6577
6578
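/*
 * Predict what cpu_util(@cpu) would be once @p has been migrated to
 * @dst_cpu: @p's util_avg is removed if it would leave @cpu and added if
 * it would arrive; the util_est sum is only bumped for the destination.
 */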
6579static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
6580{
6581 struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
6582 unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
6583
6584
6585
6586
6587
6588
6589
6590 if (task_cpu(p) == cpu && dst_cpu != cpu)
6591 lsub_positive(&util, task_util(p));
6592 else if (task_cpu(p) != cpu && dst_cpu == cpu)
6593 util += task_util(p);
6594
6595 if (sched_feat(UTIL_EST)) {
6596 util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
6597
6598
6599
6600
6601
6602
6603
6604 if (dst_cpu == cpu)
6605 util_est += _task_util_est(p);
6606
6607 util = max(util, util_est);
6608 }
6609
6610 return min(util, capacity_orig_of(cpu));
6611}
6612
6613
6614
6615
6616
6617
6618
6619
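/*
 * Estimate the energy the performance domain @pd would consume if @p were
 * placed on @dst_cpu.  sum_util aggregates the expected utilization of
 * every online CPU in the domain (the integration term), while max_util
 * determines the operating point (frequency) the whole domain would run
 * at; both are capped by the thermally reduced capacity before being
 * handed to em_cpu_energy().
 */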
6620static long
6621compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
6622{
6623 struct cpumask *pd_mask = perf_domain_span(pd);
6624 unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
6625 unsigned long max_util = 0, sum_util = 0;
6626 unsigned long _cpu_cap = cpu_cap;
6627 int cpu;
6628
6629 _cpu_cap -= arch_scale_thermal_pressure(cpumask_first(pd_mask));
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640 for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
6641 unsigned long util_freq = cpu_util_next(cpu, p, dst_cpu);
6642 unsigned long cpu_util, util_running = util_freq;
6643 struct task_struct *tsk = NULL;
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654 if (cpu == dst_cpu) {
6655 tsk = p;
6656 util_running =
6657 cpu_util_next(cpu, p, -1) + task_util_est(p);
6658 }
6659
6660
6661
6662
6663
6664
6665
6666 cpu_util = effective_cpu_util(cpu, util_running, cpu_cap,
6667 ENERGY_UTIL, NULL);
6668
6669 sum_util += min(cpu_util, _cpu_cap);
6670
6671
6672
6673
6674
6675
6676
6677
6678 cpu_util = effective_cpu_util(cpu, util_freq, cpu_cap,
6679 FREQUENCY_UTIL, tsk);
6680 max_util = max(max_util, min(cpu_util, _cpu_cap));
6681 }
6682
6683 return em_cpu_energy(pd->em_pd, max_util, sum_util, _cpu_cap);
6684}
6685
6724
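/*
 * Energy-aware wake-up placement (EAS).  For every performance domain the
 * task may run in, evaluate the energy delta of placing @p on prev_cpu and
 * on the CPU with the most spare capacity, relative to leaving @p off the
 * domain entirely (the "base" energy).  The best candidate only replaces
 * prev_cpu when it saves more than 1/16th (~6%) of the estimated total
 * energy, to avoid bouncing the task around for marginal gains.  Returns
 * -1 when EAS cannot be used (no perf domains, or the root domain is
 * overutilized) so the caller falls back to the regular wake-up path.
 */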
6725static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
6726{
6727 unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
6728 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
6729 int cpu, best_energy_cpu = prev_cpu, target = -1;
6730 unsigned long cpu_cap, util, base_energy = 0;
6731 struct sched_domain *sd;
6732 struct perf_domain *pd;
6733
6734 rcu_read_lock();
6735 pd = rcu_dereference(rd->pd);
6736 if (!pd || READ_ONCE(rd->overutilized))
6737 goto unlock;
6738
6739
6740
6741
6742
6743 sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
6744 while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
6745 sd = sd->parent;
6746 if (!sd)
6747 goto unlock;
6748
6749 target = prev_cpu;
6750
6751 sync_entity_load_avg(&p->se);
6752 if (!task_util_est(p))
6753 goto unlock;
6754
6755 for (; pd; pd = pd->next) {
6756 unsigned long cur_delta, spare_cap, max_spare_cap = 0;
6757 bool compute_prev_delta = false;
6758 unsigned long base_energy_pd;
6759 int max_spare_cap_cpu = -1;
6760
6761 for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
6762 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6763 continue;
6764
6765 util = cpu_util_next(cpu, p, cpu);
6766 cpu_cap = capacity_of(cpu);
6767 spare_cap = cpu_cap;
6768 lsub_positive(&spare_cap, util);
6769
6770
6771
6772
6773
6774
6775
6776
6777 util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
6778 if (!fits_capacity(util, cpu_cap))
6779 continue;
6780
6781 if (cpu == prev_cpu) {
6782
6783 compute_prev_delta = true;
6784 } else if (spare_cap > max_spare_cap) {
6785
6786
6787
6788
6789 max_spare_cap = spare_cap;
6790 max_spare_cap_cpu = cpu;
6791 }
6792 }
6793
6794 if (max_spare_cap_cpu < 0 && !compute_prev_delta)
6795 continue;
6796
6797
6798 base_energy_pd = compute_energy(p, -1, pd);
6799 base_energy += base_energy_pd;
6800
6801
6802 if (compute_prev_delta) {
6803 prev_delta = compute_energy(p, prev_cpu, pd);
6804 if (prev_delta < base_energy_pd)
6805 goto unlock;
6806 prev_delta -= base_energy_pd;
6807 best_delta = min(best_delta, prev_delta);
6808 }
6809
6810
6811 if (max_spare_cap_cpu >= 0) {
6812 cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
6813 if (cur_delta < base_energy_pd)
6814 goto unlock;
6815 cur_delta -= base_energy_pd;
6816 if (cur_delta < best_delta) {
6817 best_delta = cur_delta;
6818 best_energy_cpu = max_spare_cap_cpu;
6819 }
6820 }
6821 }
6822 rcu_read_unlock();
6823
6824
6825
6826
6827
6828 if ((prev_delta == ULONG_MAX) ||
6829 (prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
6830 target = best_energy_cpu;
6831
6832 return target;
6833
6834unlock:
6835 rcu_read_unlock();
6836
6837 return target;
6838}
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
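/*
 * Main CPU-selection entry point for fair tasks, called for wake-ups and
 * fork/exec balancing (the WF_* wake flags share their low nibble with the
 * corresponding SD_BALANCE_* flags, hence "wake_flags & 0xF" below).
 * Wake-ups try the EAS path first on energy-aware systems, then either the
 * affine fast path (select_idle_sibling()) or the slow find_idlest_cpu()
 * walk, depending on wake_wide()/wake_affine().
 */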
6850static int
6851select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
6852{
6853 int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
6854 struct sched_domain *tmp, *sd = NULL;
6855 int cpu = smp_processor_id();
6856 int new_cpu = prev_cpu;
6857 int want_affine = 0;
6858
6859 int sd_flag = wake_flags & 0xF;
6860
6861
6862
6863
6864 lockdep_assert_held(&p->pi_lock);
6865 if (wake_flags & WF_TTWU) {
6866 record_wakee(p);
6867
6868 if (sched_energy_enabled()) {
6869 new_cpu = find_energy_efficient_cpu(p, prev_cpu);
6870 if (new_cpu >= 0)
6871 return new_cpu;
6872 new_cpu = prev_cpu;
6873 }
6874
6875 want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
6876 }
6877
6878 rcu_read_lock();
6879 for_each_domain(cpu, tmp) {
6880
6881
6882
6883
6884 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
6885 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
6886 if (cpu != prev_cpu)
6887 new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
6888
6889 sd = NULL;
6890 break;
6891 }
6892
6893 if (tmp->flags & sd_flag)
6894 sd = tmp;
6895 else if (!want_affine)
6896 break;
6897 }
6898
6899 if (unlikely(sd)) {
6900
6901 new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
6902 } else if (wake_flags & WF_TTWU) {
6903
6904 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
6905
6906 if (want_affine)
6907 current->recent_used_cpu = cpu;
6908 }
6909 rcu_read_unlock();
6910
6911 return new_cpu;
6912}
6913
6914static void detach_entity_cfs_rq(struct sched_entity *se);
6915
6916
6917
6918
6919
6920
6921static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
6922{
6923
6924
6925
6926
6927
6928
6929 if (READ_ONCE(p->__state) == TASK_WAKING) {
6930 struct sched_entity *se = &p->se;
6931 struct cfs_rq *cfs_rq = cfs_rq_of(se);
6932 u64 min_vruntime;
6933
6934#ifndef CONFIG_64BIT
6935 u64 min_vruntime_copy;
6936
6937 do {
6938 min_vruntime_copy = cfs_rq->min_vruntime_copy;
6939 smp_rmb();
6940 min_vruntime = cfs_rq->min_vruntime;
6941 } while (min_vruntime != min_vruntime_copy);
6942#else
6943 min_vruntime = cfs_rq->min_vruntime;
6944#endif
6945
6946 se->vruntime -= min_vruntime;
6947 }
6948
6949 if (p->on_rq == TASK_ON_RQ_MIGRATING) {
6950
6951
6952
6953
6954 lockdep_assert_rq_held(task_rq(p));
6955 detach_entity_cfs_rq(&p->se);
6956
6957 } else {
6958
6959
6960
6961
6962
6963
6964
6965
6966 remove_entity_load_avg(&p->se);
6967 }
6968
6969
6970 p->se.avg.last_update_time = 0;
6971
6972
6973 p->se.exec_start = 0;
6974
6975 update_scan_period(p, new_cpu);
6976}
6977
6978static void task_dead_fair(struct task_struct *p)
6979{
6980 remove_entity_load_avg(&p->se);
6981}
6982
6983static int
6984balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
6985{
6986 if (rq->nr_running)
6987 return 1;
6988
6989 return newidle_balance(rq, rf) != 0;
6990}
6991#endif
6992
6993static unsigned long wakeup_gran(struct sched_entity *se)
6994{
6995 unsigned long gran = sysctl_sched_wakeup_granularity;
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010 return calc_delta_fair(gran, se);
7011}
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
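/*
 * Should @se preempt @curr on wake-up?  Returns 1 when curr's vruntime
 * leads se's by more than the wakeup granularity (scaled to se's weight),
 * 0 when the difference is within the granularity, and -1 when curr is not
 * ahead at all.  The granularity prevents over-eager preemption from
 * thrashing the cache.
 */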
7027static int
7028wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
7029{
7030 s64 gran, vdiff = curr->vruntime - se->vruntime;
7031
7032 if (vdiff <= 0)
7033 return -1;
7034
7035 gran = wakeup_gran(se);
7036 if (vdiff > gran)
7037 return 1;
7038
7039 return 0;
7040}
7041
7042static void set_last_buddy(struct sched_entity *se)
7043{
7044 if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
7045 return;
7046
7047 for_each_sched_entity(se) {
7048 if (SCHED_WARN_ON(!se->on_rq))
7049 return;
7050 cfs_rq_of(se)->last = se;
7051 }
7052}
7053
7054static void set_next_buddy(struct sched_entity *se)
7055{
7056 if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
7057 return;
7058
7059 for_each_sched_entity(se) {
7060 if (SCHED_WARN_ON(!se->on_rq))
7061 return;
7062 cfs_rq_of(se)->next = se;
7063 }
7064}
7065
7066static void set_skip_buddy(struct sched_entity *se)
7067{
7068 for_each_sched_entity(se)
7069 cfs_rq_of(se)->skip = se;
7070}
7071
7072
7073
7074
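/*
 * Preempt the current task with a newly woken task if needed.  Buddy hints
 * (NEXT_BUDDY/LAST_BUDDY) are honoured, non-SCHED_NORMAL wakees never drive
 * preemption, an idle-policy current task is always preempted by a normal
 * one, and otherwise the decision falls to wakeup_preempt_entity() at the
 * matching level of the group hierarchy.
 */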
7075static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
7076{
7077 struct task_struct *curr = rq->curr;
7078 struct sched_entity *se = &curr->se, *pse = &p->se;
7079 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
7080 int scale = cfs_rq->nr_running >= sched_nr_latency;
7081 int next_buddy_marked = 0;
7082
7083 if (unlikely(se == pse))
7084 return;
7085
7086
7087
7088
7089
7090
7091
7092 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
7093 return;
7094
7095 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
7096 set_next_buddy(pse);
7097 next_buddy_marked = 1;
7098 }
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110 if (test_tsk_need_resched(curr))
7111 return;
7112
7113
7114 if (unlikely(task_has_idle_policy(curr)) &&
7115 likely(!task_has_idle_policy(p)))
7116 goto preempt;
7117
7118
7119
7120
7121
7122 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
7123 return;
7124
7125 find_matching_se(&se, &pse);
7126 update_curr(cfs_rq_of(se));
7127 BUG_ON(!pse);
7128 if (wakeup_preempt_entity(se, pse) == 1) {
7129
7130
7131
7132
7133 if (!next_buddy_marked)
7134 set_next_buddy(pse);
7135 goto preempt;
7136 }
7137
7138 return;
7139
7140preempt:
7141 resched_curr(rq);
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151 if (unlikely(!se->on_rq || curr == rq->idle))
7152 return;
7153
7154 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
7155 set_last_buddy(se);
7156}
7157
7158#ifdef CONFIG_SMP
7159static struct task_struct *pick_task_fair(struct rq *rq)
7160{
7161 struct sched_entity *se;
7162 struct cfs_rq *cfs_rq;
7163
7164again:
7165 cfs_rq = &rq->cfs;
7166 if (!cfs_rq->nr_running)
7167 return NULL;
7168
7169 do {
7170 struct sched_entity *curr = cfs_rq->curr;
7171
7172
7173 if (curr) {
7174 if (curr->on_rq)
7175 update_curr(cfs_rq);
7176 else
7177 curr = NULL;
7178
7179 if (unlikely(check_cfs_rq_runtime(cfs_rq)))
7180 goto again;
7181 }
7182
7183 se = pick_next_entity(cfs_rq, curr);
7184 cfs_rq = group_cfs_rq(se);
7185 } while (cfs_rq);
7186
7187 return task_of(se);
7188}
7189#endif
7190
7191struct task_struct *
7192pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
7193{
7194 struct cfs_rq *cfs_rq = &rq->cfs;
7195 struct sched_entity *se;
7196 struct task_struct *p;
7197 int new_tasks;
7198
7199again:
7200 if (!sched_fair_runnable(rq))
7201 goto idle;
7202
7203#ifdef CONFIG_FAIR_GROUP_SCHED
7204 if (!prev || prev->sched_class != &fair_sched_class)
7205 goto simple;
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215 do {
7216 struct sched_entity *curr = cfs_rq->curr;
7217
7218
7219
7220
7221
7222
7223
7224 if (curr) {
7225 if (curr->on_rq)
7226 update_curr(cfs_rq);
7227 else
7228 curr = NULL;
7229
7230
7231
7232
7233
7234
7235
7236 if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
7237 cfs_rq = &rq->cfs;
7238
7239 if (!cfs_rq->nr_running)
7240 goto idle;
7241
7242 goto simple;
7243 }
7244 }
7245
7246 se = pick_next_entity(cfs_rq, curr);
7247 cfs_rq = group_cfs_rq(se);
7248 } while (cfs_rq);
7249
7250 p = task_of(se);
7251
7252
7253
7254
7255
7256
7257 if (prev != p) {
7258 struct sched_entity *pse = &prev->se;
7259
7260 while (!(cfs_rq = is_same_group(se, pse))) {
7261 int se_depth = se->depth;
7262 int pse_depth = pse->depth;
7263
7264 if (se_depth <= pse_depth) {
7265 put_prev_entity(cfs_rq_of(pse), pse);
7266 pse = parent_entity(pse);
7267 }
7268 if (se_depth >= pse_depth) {
7269 set_next_entity(cfs_rq_of(se), se);
7270 se = parent_entity(se);
7271 }
7272 }
7273
7274 put_prev_entity(cfs_rq, pse);
7275 set_next_entity(cfs_rq, se);
7276 }
7277
7278 goto done;
7279simple:
7280#endif
7281 if (prev)
7282 put_prev_task(rq, prev);
7283
7284 do {
7285 se = pick_next_entity(cfs_rq, NULL);
7286 set_next_entity(cfs_rq, se);
7287 cfs_rq = group_cfs_rq(se);
7288 } while (cfs_rq);
7289
7290 p = task_of(se);
7291
7292done: __maybe_unused;
7293#ifdef CONFIG_SMP
7294
7295
7296
7297
7298
7299 list_move(&p->se.group_node, &rq->cfs_tasks);
7300#endif
7301
7302 if (hrtick_enabled_fair(rq))
7303 hrtick_start_fair(rq, p);
7304
7305 update_misfit_status(p, rq);
7306
7307 return p;
7308
7309idle:
7310 if (!rf)
7311 return NULL;
7312
7313 new_tasks = newidle_balance(rq, rf);
7314
7315
7316
7317
7318
7319
7320 if (new_tasks < 0)
7321 return RETRY_TASK;
7322
7323 if (new_tasks > 0)
7324 goto again;
7325
7326
7327
7328
7329
7330 update_idle_rq_clock_pelt(rq);
7331
7332 return NULL;
7333}
7334
7335static struct task_struct *__pick_next_task_fair(struct rq *rq)
7336{
7337 return pick_next_task_fair(rq, NULL, NULL);
7338}
7339
7340
7341
7342
7343static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
7344{
7345 struct sched_entity *se = &prev->se;
7346 struct cfs_rq *cfs_rq;
7347
7348 for_each_sched_entity(se) {
7349 cfs_rq = cfs_rq_of(se);
7350 put_prev_entity(cfs_rq, se);
7351 }
7352}
7353
7354
7355
7356
7357
7358
7359static void yield_task_fair(struct rq *rq)
7360{
7361 struct task_struct *curr = rq->curr;
7362 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
7363 struct sched_entity *se = &curr->se;
7364
7365
7366
7367
7368 if (unlikely(rq->nr_running == 1))
7369 return;
7370
7371 clear_buddies(cfs_rq, se);
7372
7373 if (curr->policy != SCHED_BATCH) {
7374 update_rq_clock(rq);
7375
7376
7377
7378 update_curr(cfs_rq);
7379
7380
7381
7382
7383
7384 rq_clock_skip_update(rq);
7385 }
7386
7387 set_skip_buddy(se);
7388}
7389
7390static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
7391{
7392 struct sched_entity *se = &p->se;
7393
7394
7395 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
7396 return false;
7397
7398
7399 set_next_buddy(se);
7400
7401 yield_task_fair(rq);
7402
7403 return true;
7404}
7405
7406#ifdef CONFIG_SMP
7524
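/*
 * Load balancing proper starts here.  Per-domain balance intervals derived
 * from sd->balance_interval are clamped to max_load_balance_interval, i.e.
 * HZ/10 jiffies (100ms) at most between periodic updates on any one domain.
 */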
7525static unsigned long __read_mostly max_load_balance_interval = HZ/10;
7526
7527enum fbq_type { regular, remote, all };
7528
7529
7530
7531
7532
7533
7534
7535
7536enum group_type {
7537
7538 group_has_spare = 0,
7539
7540
7541
7542
7543 group_fully_busy,
7544
7545
7546
7547
7548 group_misfit_task,
7549
7550
7551
7552
7553
7554 group_asym_packing,
7555
7556
7557
7558
7559 group_imbalanced,
7560
7561
7562
7563
7564 group_overloaded
7565};
7566
7567enum migration_type {
7568 migrate_load = 0,
7569 migrate_util,
7570 migrate_task,
7571 migrate_misfit
7572};
7573
7574#define LBF_ALL_PINNED 0x01
7575#define LBF_NEED_BREAK 0x02
7576#define LBF_DST_PINNED 0x04
7577#define LBF_SOME_PINNED 0x08
7578#define LBF_ACTIVE_LB 0x10
7579
7580struct lb_env {
7581 struct sched_domain *sd;
7582
7583 struct rq *src_rq;
7584 int src_cpu;
7585
7586 int dst_cpu;
7587 struct rq *dst_rq;
7588
7589 struct cpumask *dst_grpmask;
7590 int new_dst_cpu;
7591 enum cpu_idle_type idle;
7592 long imbalance;
7593
7594 struct cpumask *cpus;
7595
7596 unsigned int flags;
7597
7598 unsigned int loop;
7599 unsigned int loop_break;
7600 unsigned int loop_max;
7601
7602 enum fbq_type fbq_type;
7603 enum migration_type migration_type;
7604 struct list_head tasks;
7605};
7606
7607
7608
7609
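/*
 * Is @p likely to still be cache-hot on its current CPU?  Next/last buddies
 * are treated as hot when the destination is busy, SMT siblings share cache
 * so nothing is hot there, and otherwise a task is hot if it ran within the
 * last sysctl_sched_migration_cost ns (-1 means always hot, 0 never hot).
 * Tasks whose core-scheduling cookie does not match the destination are
 * also reported hot to keep them where they are.
 */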
7610static int task_hot(struct task_struct *p, struct lb_env *env)
7611{
7612 s64 delta;
7613
7614 lockdep_assert_rq_held(env->src_rq);
7615
7616 if (p->sched_class != &fair_sched_class)
7617 return 0;
7618
7619 if (unlikely(task_has_idle_policy(p)))
7620 return 0;
7621
7622
7623 if (env->sd->flags & SD_SHARE_CPUCAPACITY)
7624 return 0;
7625
7626
7627
7628
7629 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
7630 (&p->se == cfs_rq_of(&p->se)->next ||
7631 &p->se == cfs_rq_of(&p->se)->last))
7632 return 1;
7633
7634 if (sysctl_sched_migration_cost == -1)
7635 return 1;
7636
7637
7638
7639
7640
7641 if (!sched_core_cookie_match(cpu_rq(env->dst_cpu), p))
7642 return 1;
7643
7644 if (sysctl_sched_migration_cost == 0)
7645 return 0;
7646
7647 delta = rq_clock_task(env->src_rq) - p->se.exec_start;
7648
7649 return delta < (s64)sysctl_sched_migration_cost;
7650}
7651
7652#ifdef CONFIG_NUMA_BALANCING
7653
7654
7655
7656
7657
7658static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
7659{
7660 struct numa_group *numa_group = rcu_dereference(p->numa_group);
7661 unsigned long src_weight, dst_weight;
7662 int src_nid, dst_nid, dist;
7663
7664 if (!static_branch_likely(&sched_numa_balancing))
7665 return -1;
7666
7667 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
7668 return -1;
7669
7670 src_nid = cpu_to_node(env->src_cpu);
7671 dst_nid = cpu_to_node(env->dst_cpu);
7672
7673 if (src_nid == dst_nid)
7674 return -1;
7675
7676
7677 if (src_nid == p->numa_preferred_nid) {
7678 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
7679 return 1;
7680 else
7681 return -1;
7682 }
7683
7684
7685 if (dst_nid == p->numa_preferred_nid)
7686 return 0;
7687
7688
7689 if (env->idle == CPU_IDLE)
7690 return -1;
7691
7692 dist = node_distance(src_nid, dst_nid);
7693 if (numa_group) {
7694 src_weight = group_weight(p, src_nid, dist);
7695 dst_weight = group_weight(p, dst_nid, dist);
7696 } else {
7697 src_weight = task_weight(p, src_nid, dist);
7698 dst_weight = task_weight(p, dst_nid, dist);
7699 }
7700
7701 return dst_weight < src_weight;
7702}
7703
7704#else
7705static inline int migrate_degrades_locality(struct task_struct *p,
7706 struct lb_env *env)
7707{
7708 return -1;
7709}
7710#endif
7711
7712
7713
7714
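/*
 * May @p be moved to env->dst_cpu?  Migration is refused for throttled
 * src/dst pairs, per-CPU kthreads, tasks excluded by their CPU affinity
 * (recording LBF_SOME_PINNED and a possible alternative dst CPU), tasks
 * currently running on the source CPU, and cache-hot tasks unless the
 * domain has already failed to balance more than cache_nice_tries times.
 * Active balancing (LBF_ACTIVE_LB) skips the hotness check entirely.
 */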
7715static
7716int can_migrate_task(struct task_struct *p, struct lb_env *env)
7717{
7718 int tsk_cache_hot;
7719
7720 lockdep_assert_rq_held(env->src_rq);
7721
7722
7723
7724
7725
7726
7727
7728
7729 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
7730 return 0;
7731
7732
7733 if (kthread_is_per_cpu(p))
7734 return 0;
7735
7736 if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
7737 int cpu;
7738
7739 schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
7740
7741 env->flags |= LBF_SOME_PINNED;
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753 if (env->idle == CPU_NEWLY_IDLE ||
7754 env->flags & (LBF_DST_PINNED | LBF_ACTIVE_LB))
7755 return 0;
7756
7757
7758 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
7759 if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
7760 env->flags |= LBF_DST_PINNED;
7761 env->new_dst_cpu = cpu;
7762 break;
7763 }
7764 }
7765
7766 return 0;
7767 }
7768
7769
7770 env->flags &= ~LBF_ALL_PINNED;
7771
7772 if (task_running(env->src_rq, p)) {
7773 schedstat_inc(p->se.statistics.nr_failed_migrations_running);
7774 return 0;
7775 }
7776
7777
7778
7779
7780
7781
7782
7783
7784 if (env->flags & LBF_ACTIVE_LB)
7785 return 1;
7786
7787 tsk_cache_hot = migrate_degrades_locality(p, env);
7788 if (tsk_cache_hot == -1)
7789 tsk_cache_hot = task_hot(p, env);
7790
7791 if (tsk_cache_hot <= 0 ||
7792 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
7793 if (tsk_cache_hot == 1) {
7794 schedstat_inc(env->sd->lb_hot_gained[env->idle]);
7795 schedstat_inc(p->se.statistics.nr_forced_migrations);
7796 }
7797 return 1;
7798 }
7799
7800 schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
7801 return 0;
7802}
7803
7804
7805
7806
7807static void detach_task(struct task_struct *p, struct lb_env *env)
7808{
7809 lockdep_assert_rq_held(env->src_rq);
7810
7811 deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
7812 set_task_cpu(p, env->dst_cpu);
7813}
7814
7815
7816
7817
7818
7819
7820
7821static struct task_struct *detach_one_task(struct lb_env *env)
7822{
7823 struct task_struct *p;
7824
7825 lockdep_assert_rq_held(env->src_rq);
7826
7827 list_for_each_entry_reverse(p,
7828 &env->src_rq->cfs_tasks, se.group_node) {
7829 if (!can_migrate_task(p, env))
7830 continue;
7831
7832 detach_task(p, env);
7833
7834
7835
7836
7837
7838
7839
7840 schedstat_inc(env->sd->lb_gained[env->idle]);
7841 return p;
7842 }
7843 return NULL;
7844}
7845
7846static const unsigned int sched_nr_migrate_break = 32;
7847
7848
7849
7850
7851
7852
7853
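/*
 * Detach up to env->imbalance worth of load, utilization or tasks
 * (depending on env->migration_type) from the busiest runqueue, walking its
 * cfs_tasks list from the tail.  The loop is bounded by loop_max and
 * requests a lock break every sched_nr_migrate_break iterations; detached
 * tasks are collected on env->tasks for attach_tasks().
 */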
7854static int detach_tasks(struct lb_env *env)
7855{
7856 struct list_head *tasks = &env->src_rq->cfs_tasks;
7857 unsigned long util, load;
7858 struct task_struct *p;
7859 int detached = 0;
7860
7861 lockdep_assert_rq_held(env->src_rq);
7862
7863
7864
7865
7866
7867 if (env->src_rq->nr_running <= 1) {
7868 env->flags &= ~LBF_ALL_PINNED;
7869 return 0;
7870 }
7871
7872 if (env->imbalance <= 0)
7873 return 0;
7874
7875 while (!list_empty(tasks)) {
7876
7877
7878
7879
7880 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
7881 break;
7882
7883 p = list_last_entry(tasks, struct task_struct, se.group_node);
7884
7885 env->loop++;
7886
7887 if (env->loop > env->loop_max)
7888 break;
7889
7890
7891 if (env->loop > env->loop_break) {
7892 env->loop_break += sched_nr_migrate_break;
7893 env->flags |= LBF_NEED_BREAK;
7894 break;
7895 }
7896
7897 if (!can_migrate_task(p, env))
7898 goto next;
7899
7900 switch (env->migration_type) {
7901 case migrate_load:
7902
7903
7904
7905
7906
7907
7908
7909 load = max_t(unsigned long, task_h_load(p), 1);
7910
7911 if (sched_feat(LB_MIN) &&
7912 load < 16 && !env->sd->nr_balance_failed)
7913 goto next;
7914
7915
7916
7917
7918
7919
7920
7921 if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance)
7922 goto next;
7923
7924 env->imbalance -= load;
7925 break;
7926
7927 case migrate_util:
7928 util = task_util_est(p);
7929
7930 if (util > env->imbalance)
7931 goto next;
7932
7933 env->imbalance -= util;
7934 break;
7935
7936 case migrate_task:
7937 env->imbalance--;
7938 break;
7939
7940 case migrate_misfit:
7941
7942 if (task_fits_capacity(p, capacity_of(env->src_cpu)))
7943 goto next;
7944
7945 env->imbalance = 0;
7946 break;
7947 }
7948
7949 detach_task(p, env);
7950 list_add(&p->se.group_node, &env->tasks);
7951
7952 detached++;
7953
7954#ifdef CONFIG_PREEMPTION
7955
7956
7957
7958
7959
7960 if (env->idle == CPU_NEWLY_IDLE)
7961 break;
7962#endif
7963
7964
7965
7966
7967
7968 if (env->imbalance <= 0)
7969 break;
7970
7971 continue;
7972next:
7973 list_move(&p->se.group_node, tasks);
7974 }
7975
7976
7977
7978
7979
7980
7981 schedstat_add(env->sd->lb_gained[env->idle], detached);
7982
7983 return detached;
7984}
7985
7986
7987
7988
7989static void attach_task(struct rq *rq, struct task_struct *p)
7990{
7991 lockdep_assert_rq_held(rq);
7992
7993 BUG_ON(task_rq(p) != rq);
7994 activate_task(rq, p, ENQUEUE_NOCLOCK);
7995 check_preempt_curr(rq, p, 0);
7996}
7997
7998
7999
8000
8001
8002static void attach_one_task(struct rq *rq, struct task_struct *p)
8003{
8004 struct rq_flags rf;
8005
8006 rq_lock(rq, &rf);
8007 update_rq_clock(rq);
8008 attach_task(rq, p);
8009 rq_unlock(rq, &rf);
8010}
8011
8012
8013
8014
8015
8016static void attach_tasks(struct lb_env *env)
8017{
8018 struct list_head *tasks = &env->tasks;
8019 struct task_struct *p;
8020 struct rq_flags rf;
8021
8022 rq_lock(env->dst_rq, &rf);
8023 update_rq_clock(env->dst_rq);
8024
8025 while (!list_empty(tasks)) {
8026 p = list_first_entry(tasks, struct task_struct, se.group_node);
8027 list_del_init(&p->se.group_node);
8028
8029 attach_task(env->dst_rq, p);
8030 }
8031
8032 rq_unlock(env->dst_rq, &rf);
8033}
8034
8035#ifdef CONFIG_NO_HZ_COMMON
8036static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
8037{
8038 if (cfs_rq->avg.load_avg)
8039 return true;
8040
8041 if (cfs_rq->avg.util_avg)
8042 return true;
8043
8044 return false;
8045}
8046
8047static inline bool others_have_blocked(struct rq *rq)
8048{
8049 if (READ_ONCE(rq->avg_rt.util_avg))
8050 return true;
8051
8052 if (READ_ONCE(rq->avg_dl.util_avg))
8053 return true;
8054
8055 if (thermal_load_avg(rq))
8056 return true;
8057
8058#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
8059 if (READ_ONCE(rq->avg_irq.util_avg))
8060 return true;
8061#endif
8062
8063 return false;
8064}
8065
8066static inline void update_blocked_load_tick(struct rq *rq)
8067{
8068 WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies);
8069}
8070
8071static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
8072{
8073 if (!has_blocked)
8074 rq->has_blocked_load = 0;
8075}
8076#else
8077static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
8078static inline bool others_have_blocked(struct rq *rq) { return false; }
8079static inline void update_blocked_load_tick(struct rq *rq) {}
8080static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
8081#endif
8082
8083static bool __update_blocked_others(struct rq *rq, bool *done)
8084{
8085 const struct sched_class *curr_class;
8086 u64 now = rq_clock_pelt(rq);
8087 unsigned long thermal_pressure;
8088 bool decayed;
8089
8090
8091
8092
8093
8094 curr_class = rq->curr->sched_class;
8095
8096 thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
8097
8098 decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
8099 update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
8100 update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) |
8101 update_irq_load_avg(rq, 0);
8102
8103 if (others_have_blocked(rq))
8104 *done = false;
8105
8106 return decayed;
8107}
8108
8109#ifdef CONFIG_FAIR_GROUP_SCHED
8110
8111static bool __update_blocked_fair(struct rq *rq, bool *done)
8112{
8113 struct cfs_rq *cfs_rq, *pos;
8114 bool decayed = false;
8115 int cpu = cpu_of(rq);
8116
8117
8118
8119
8120
8121 for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
8122 struct sched_entity *se;
8123
8124 if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
8125 update_tg_load_avg(cfs_rq);
8126
8127 if (cfs_rq == &rq->cfs)
8128 decayed = true;
8129 }
8130
8131
8132 se = cfs_rq->tg->se[cpu];
8133 if (se && !skip_blocked_update(se))
8134 update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
8135
8136
8137
8138
8139
8140 if (cfs_rq_is_decayed(cfs_rq))
8141 list_del_leaf_cfs_rq(cfs_rq);
8142
8143
8144 if (cfs_rq_has_blocked(cfs_rq))
8145 *done = false;
8146 }
8147
8148 return decayed;
8149}
8150
8151
8152
8153
8154
8155
8156static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
8157{
8158 struct rq *rq = rq_of(cfs_rq);
8159 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
8160 unsigned long now = jiffies;
8161 unsigned long load;
8162
8163 if (cfs_rq->last_h_load_update == now)
8164 return;
8165
8166 WRITE_ONCE(cfs_rq->h_load_next, NULL);
8167 for_each_sched_entity(se) {
8168 cfs_rq = cfs_rq_of(se);
8169 WRITE_ONCE(cfs_rq->h_load_next, se);
8170 if (cfs_rq->last_h_load_update == now)
8171 break;
8172 }
8173
8174 if (!se) {
8175 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
8176 cfs_rq->last_h_load_update = now;
8177 }
8178
8179 while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
8180 load = cfs_rq->h_load;
8181 load = div64_ul(load * se->avg.load_avg,
8182 cfs_rq_load_avg(cfs_rq) + 1);
8183 cfs_rq = group_cfs_rq(se);
8184 cfs_rq->h_load = load;
8185 cfs_rq->last_h_load_update = now;
8186 }
8187}
8188
8189static unsigned long task_h_load(struct task_struct *p)
8190{
8191 struct cfs_rq *cfs_rq = task_cfs_rq(p);
8192
8193 update_cfs_rq_h_load(cfs_rq);
8194 return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
8195 cfs_rq_load_avg(cfs_rq) + 1);
8196}
8197#else
8198static bool __update_blocked_fair(struct rq *rq, bool *done)
8199{
8200 struct cfs_rq *cfs_rq = &rq->cfs;
8201 bool decayed;
8202
8203 decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
8204 if (cfs_rq_has_blocked(cfs_rq))
8205 *done = false;
8206
8207 return decayed;
8208}
8209
8210static unsigned long task_h_load(struct task_struct *p)
8211{
8212 return p->se.avg.load_avg;
8213}
8214#endif
8215
8216static void update_blocked_averages(int cpu)
8217{
8218 bool decayed = false, done = true;
8219 struct rq *rq = cpu_rq(cpu);
8220 struct rq_flags rf;
8221
8222 rq_lock_irqsave(rq, &rf);
8223 update_blocked_load_tick(rq);
8224 update_rq_clock(rq);
8225
8226 decayed |= __update_blocked_others(rq, &done);
8227 decayed |= __update_blocked_fair(rq, &done);
8228
8229 update_blocked_load_status(rq, !done);
8230 if (decayed)
8231 cpufreq_update_util(rq, 0);
8232 rq_unlock_irqrestore(rq, &rf);
8233}
8234
8235
8236
8237
8238
8239
8240struct sg_lb_stats {
8241 unsigned long avg_load;
8242 unsigned long group_load;
8243 unsigned long group_capacity;
8244 unsigned long group_util;
8245 unsigned long group_runnable;
8246 unsigned int sum_nr_running;
8247 unsigned int sum_h_nr_running;
8248 unsigned int idle_cpus;
8249 unsigned int group_weight;
8250 enum group_type group_type;
8251 unsigned int group_asym_packing;
8252 unsigned long group_misfit_task_load;
8253#ifdef CONFIG_NUMA_BALANCING
8254 unsigned int nr_numa_running;
8255 unsigned int nr_preferred_running;
8256#endif
8257};
8258
8259
8260
8261
8262
8263struct sd_lb_stats {
8264 struct sched_group *busiest;
8265 struct sched_group *local;
8266 unsigned long total_load;
8267 unsigned long total_capacity;
8268 unsigned long avg_load;
8269 unsigned int prefer_sibling;
8270
8271 struct sg_lb_stats busiest_stat;
8272 struct sg_lb_stats local_stat;
8273};
8274
8275static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
8276{
8277
8278
8279
8280
8281
8282
8283
8284 *sds = (struct sd_lb_stats){
8285 .busiest = NULL,
8286 .local = NULL,
8287 .total_load = 0UL,
8288 .total_capacity = 0UL,
8289 .busiest_stat = {
8290 .idle_cpus = UINT_MAX,
8291 .group_type = group_has_spare,
8292 },
8293 };
8294}
8295
8296static unsigned long scale_rt_capacity(int cpu)
8297{
8298 struct rq *rq = cpu_rq(cpu);
8299 unsigned long max = arch_scale_cpu_capacity(cpu);
8300 unsigned long used, free;
8301 unsigned long irq;
8302
8303 irq = cpu_util_irq(rq);
8304
8305 if (unlikely(irq >= max))
8306 return 1;
8307
8308
8309
8310
8311
8312
8313
8314 used = READ_ONCE(rq->avg_rt.util_avg);
8315 used += READ_ONCE(rq->avg_dl.util_avg);
8316 used += thermal_load_avg(rq);
8317
8318 if (unlikely(used >= max))
8319 return 1;
8320
8321 free = max - used;
8322
8323 return scale_irq_capacity(free, irq, max);
8324}
8325
8326static void update_cpu_capacity(struct sched_domain *sd, int cpu)
8327{
8328 unsigned long capacity = scale_rt_capacity(cpu);
8329 struct sched_group *sdg = sd->groups;
8330
8331 cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
8332
8333 if (!capacity)
8334 capacity = 1;
8335
8336 cpu_rq(cpu)->cpu_capacity = capacity;
8337 trace_sched_cpu_capacity_tp(cpu_rq(cpu));
8338
8339 sdg->sgc->capacity = capacity;
8340 sdg->sgc->min_capacity = capacity;
8341 sdg->sgc->max_capacity = capacity;
8342}
8343
8344void update_group_capacity(struct sched_domain *sd, int cpu)
8345{
8346 struct sched_domain *child = sd->child;
8347 struct sched_group *group, *sdg = sd->groups;
8348 unsigned long capacity, min_capacity, max_capacity;
8349 unsigned long interval;
8350
8351 interval = msecs_to_jiffies(sd->balance_interval);
8352 interval = clamp(interval, 1UL, max_load_balance_interval);
8353 sdg->sgc->next_update = jiffies + interval;
8354
8355 if (!child) {
8356 update_cpu_capacity(sd, cpu);
8357 return;
8358 }
8359
8360 capacity = 0;
8361 min_capacity = ULONG_MAX;
8362 max_capacity = 0;
8363
8364 if (child->flags & SD_OVERLAP) {
8365
8366
8367
8368
8369
8370 for_each_cpu(cpu, sched_group_span(sdg)) {
8371 unsigned long cpu_cap = capacity_of(cpu);
8372
8373 capacity += cpu_cap;
8374 min_capacity = min(cpu_cap, min_capacity);
8375 max_capacity = max(cpu_cap, max_capacity);
8376 }
8377 } else {
8378
8379
8380
8381
8382
8383 group = child->groups;
8384 do {
8385 struct sched_group_capacity *sgc = group->sgc;
8386
8387 capacity += sgc->capacity;
8388 min_capacity = min(sgc->min_capacity, min_capacity);
8389 max_capacity = max(sgc->max_capacity, max_capacity);
8390 group = group->next;
8391 } while (group != child->groups);
8392 }
8393
8394 sdg->sgc->capacity = capacity;
8395 sdg->sgc->min_capacity = min_capacity;
8396 sdg->sgc->max_capacity = max_capacity;
8397}
8398
8399
8400
8401
8402
8403
8404static inline int
8405check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
8406{
8407 return ((rq->cpu_capacity * sd->imbalance_pct) <
8408 (rq->cpu_capacity_orig * 100));
8409}
8410
8411
8412
8413
8414
8415
8416static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
8417{
8418 return rq->misfit_task_load &&
8419 (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
8420 check_cpu_capacity(rq, sd));
8421}
8422
8423
8424
8425
8426
8427
8428
8429
8430
8431
8432
8433
8434
8435
8436
8437
8438
8439
8440
8441
8442
8443
8444
8445
8446
8447
8448
8449
8450
8451
8452static inline int sg_imbalanced(struct sched_group *group)
8453{
8454 return group->sgc->imbalance;
8455}
8456
8457
8458
8459
8460
8461
8462
8463
8464
8465
8466
8467
8468
8469static inline bool
8470group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
8471{
8472 if (sgs->sum_nr_running < sgs->group_weight)
8473 return true;
8474
8475 if ((sgs->group_capacity * imbalance_pct) <
8476 (sgs->group_runnable * 100))
8477 return false;
8478
8479 if ((sgs->group_capacity * 100) >
8480 (sgs->group_util * imbalance_pct))
8481 return true;
8482
8483 return false;
8484}
8485
8486
8487
8488
8489
8490
8491
8492
8493
8494static inline bool
8495group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
8496{
8497 if (sgs->sum_nr_running <= sgs->group_weight)
8498 return false;
8499
8500 if ((sgs->group_capacity * 100) <
8501 (sgs->group_util * imbalance_pct))
8502 return true;
8503
8504 if ((sgs->group_capacity * imbalance_pct) <
8505 (sgs->group_runnable * 100))
8506 return true;
8507
8508 return false;
8509}
8510
8511static inline enum
8512group_type group_classify(unsigned int imbalance_pct,
8513 struct sched_group *group,
8514 struct sg_lb_stats *sgs)
8515{
8516 if (group_is_overloaded(imbalance_pct, sgs))
8517 return group_overloaded;
8518
8519 if (sg_imbalanced(group))
8520 return group_imbalanced;
8521
8522 if (sgs->group_asym_packing)
8523 return group_asym_packing;
8524
8525 if (sgs->group_misfit_task_load)
8526 return group_misfit_task;
8527
8528 if (!group_has_capacity(imbalance_pct, sgs))
8529 return group_fully_busy;
8530
8531 return group_has_spare;
8532}
8533
8534
8535
8536
8537
8538
8539
8540
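/*
 * Collect load-balancing statistics for @group: aggregate load, utilization,
 * runnable and h_nr_running over its CPUs, count idle CPUs, note misfit
 * tasks and asym-packing opportunities, then classify the group via
 * group_classify().
 */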
8541static inline void update_sg_lb_stats(struct lb_env *env,
8542 struct sched_group *group,
8543 struct sg_lb_stats *sgs,
8544 int *sg_status)
8545{
8546 int i, nr_running, local_group;
8547
8548 memset(sgs, 0, sizeof(*sgs));
8549
8550 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
8551
8552 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
8553 struct rq *rq = cpu_rq(i);
8554
8555 sgs->group_load += cpu_load(rq);
8556 sgs->group_util += cpu_util(i);
8557 sgs->group_runnable += cpu_runnable(rq);
8558 sgs->sum_h_nr_running += rq->cfs.h_nr_running;
8559
8560 nr_running = rq->nr_running;
8561 sgs->sum_nr_running += nr_running;
8562
8563 if (nr_running > 1)
8564 *sg_status |= SG_OVERLOAD;
8565
8566 if (cpu_overutilized(i))
8567 *sg_status |= SG_OVERUTILIZED;
8568
8569#ifdef CONFIG_NUMA_BALANCING
8570 sgs->nr_numa_running += rq->nr_numa_running;
8571 sgs->nr_preferred_running += rq->nr_preferred_running;
8572#endif
8573
8574
8575
8576 if (!nr_running && idle_cpu(i)) {
8577 sgs->idle_cpus++;
8578
8579 continue;
8580 }
8581
8582 if (local_group)
8583 continue;
8584
8585
8586 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
8587 sgs->group_misfit_task_load < rq->misfit_task_load) {
8588 sgs->group_misfit_task_load = rq->misfit_task_load;
8589 *sg_status |= SG_OVERLOAD;
8590 }
8591 }
8592
8593
8594 if (env->sd->flags & SD_ASYM_PACKING &&
8595 env->idle != CPU_NOT_IDLE &&
8596 sgs->sum_h_nr_running &&
8597 sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {
8598 sgs->group_asym_packing = 1;
8599 }
8600
8601 sgs->group_capacity = group->sgc->capacity;
8602
8603 sgs->group_weight = group->group_weight;
8604
8605 sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
8606
8607
8608 if (sgs->group_type == group_overloaded)
8609 sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
8610 sgs->group_capacity;
8611}
8612
8613
8614
8615
8616
8617
8618
8619
8620
8621
8622
8623
8624
8625
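/*
 * update_sd_pick_busiest(): decide whether @sg should replace the
 * currently recorded busiest group. A higher group_type always wins; ties
 * within the same type are broken per type below: by avg_load for
 * overloaded and fully busy groups, by asym-packing CPU priority, by the
 * size of the misfit load, or by idle CPUs and running task counts for
 * groups with spare capacity.
 */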
8626static bool update_sd_pick_busiest(struct lb_env *env,
8627 struct sd_lb_stats *sds,
8628 struct sched_group *sg,
8629 struct sg_lb_stats *sgs)
8630{
8631 struct sg_lb_stats *busiest = &sds->busiest_stat;
8632
8633
8634 if (!sgs->sum_h_nr_running)
8635 return false;
8636
8637
8638
8639
8640
8641
8642
8643 if (sgs->group_type == group_misfit_task &&
8644 (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
8645 sds->local_stat.group_type != group_has_spare))
8646 return false;
8647
8648 if (sgs->group_type > busiest->group_type)
8649 return true;
8650
8651 if (sgs->group_type < busiest->group_type)
8652 return false;
8653
8654
8655
8656
8657
8658
8659 switch (sgs->group_type) {
8660 case group_overloaded:
8661
8662 if (sgs->avg_load <= busiest->avg_load)
8663 return false;
8664 break;
8665
8666 case group_imbalanced:
8667
8668
8669
8670
8671 return false;
8672
8673 case group_asym_packing:
8674
8675 if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
8676 return false;
8677 break;
8678
8679 case group_misfit_task:
8680
8681
8682
8683
8684 if (sgs->group_misfit_task_load < busiest->group_misfit_task_load)
8685 return false;
8686 break;
8687
8688 case group_fully_busy:
8689
8690
8691
8692
8693
8694
8695
8696
8697
8698
8699 if (sgs->avg_load <= busiest->avg_load)
8700 return false;
8701 break;
8702
8703 case group_has_spare:
8704
8705
8706
8707
8708
8709
8710
8711 if (sgs->idle_cpus > busiest->idle_cpus)
8712 return false;
8713 else if ((sgs->idle_cpus == busiest->idle_cpus) &&
8714 (sgs->sum_nr_running <= busiest->sum_nr_running))
8715 return false;
8716
8717 break;
8718 }
8719
8720
8721
8722
8723
8724
8725
8726 if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
8727 (sgs->group_type <= group_fully_busy) &&
8728 (capacity_greater(sg->sgc->min_capacity, capacity_of(env->dst_cpu))))
8729 return false;
8730
8731 return true;
8732}
8733
8734#ifdef CONFIG_NUMA_BALANCING
8735static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
8736{
8737 if (sgs->sum_h_nr_running > sgs->nr_numa_running)
8738 return regular;
8739 if (sgs->sum_h_nr_running > sgs->nr_preferred_running)
8740 return remote;
8741 return all;
8742}
8743
8744static inline enum fbq_type fbq_classify_rq(struct rq *rq)
8745{
8746 if (rq->nr_running > rq->nr_numa_running)
8747 return regular;
8748 if (rq->nr_running > rq->nr_preferred_running)
8749 return remote;
8750 return all;
8751}
8752#else
8753static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
8754{
8755 return all;
8756}
8757
8758static inline enum fbq_type fbq_classify_rq(struct rq *rq)
8759{
8760 return regular;
8761}
8762#endif
8763
8764
8765struct sg_lb_stats;
8766
8767
8768
8769
8770
8771static unsigned int task_running_on_cpu(int cpu, struct task_struct *p)
8772{
8773
8774 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
8775 return 0;
8776
8777 if (task_on_rq_queued(p))
8778 return 1;
8779
8780 return 0;
8781}
8782
8783
8784
8785
8786
8787
8788
8789
8790static int idle_cpu_without(int cpu, struct task_struct *p)
8791{
8792 struct rq *rq = cpu_rq(cpu);
8793
8794 if (rq->curr != rq->idle && rq->curr != p)
8795 return 0;
8796
8797
8798
8799
8800
8801
8802
8803#ifdef CONFIG_SMP
8804 if (rq->ttwu_pending)
8805 return 0;
8806#endif
8807
8808 return 1;
8809}
8810
8811
8812
8813
8814
8815
8816
8817
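/*
 * update_sg_wakeup_stats(): wakeup/fork-time counterpart of
 * update_sg_lb_stats(). The contribution of the waking task @p is removed
 * from each CPU via the *_without() helpers, so the group is evaluated as
 * if @p were not accounted anywhere yet.
 */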
8818static inline void update_sg_wakeup_stats(struct sched_domain *sd,
8819 struct sched_group *group,
8820 struct sg_lb_stats *sgs,
8821 struct task_struct *p)
8822{
8823 int i, nr_running;
8824
8825 memset(sgs, 0, sizeof(*sgs));
8826
8827 for_each_cpu(i, sched_group_span(group)) {
8828 struct rq *rq = cpu_rq(i);
8829 unsigned int local;
8830
8831 sgs->group_load += cpu_load_without(rq, p);
8832 sgs->group_util += cpu_util_without(i, p);
8833 sgs->group_runnable += cpu_runnable_without(rq, p);
8834 local = task_running_on_cpu(i, p);
8835 sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
8836
8837 nr_running = rq->nr_running - local;
8838 sgs->sum_nr_running += nr_running;
8839
8840
8841
8842
8843 if (!nr_running && idle_cpu_without(i, p))
8844 sgs->idle_cpus++;
8845
8846 }
8847
8848
8849 if (sd->flags & SD_ASYM_CPUCAPACITY &&
8850 !task_fits_capacity(p, group->sgc->max_capacity)) {
8851 sgs->group_misfit_task_load = 1;
8852 }
8853
8854 sgs->group_capacity = group->sgc->capacity;
8855
8856 sgs->group_weight = group->group_weight;
8857
8858 sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);
8859
8860
8861
8862
8863
8864 if (sgs->group_type == group_fully_busy ||
8865 sgs->group_type == group_overloaded)
8866 sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
8867 sgs->group_capacity;
8868}
8869
8870static bool update_pick_idlest(struct sched_group *idlest,
8871 struct sg_lb_stats *idlest_sgs,
8872 struct sched_group *group,
8873 struct sg_lb_stats *sgs)
8874{
8875 if (sgs->group_type < idlest_sgs->group_type)
8876 return true;
8877
8878 if (sgs->group_type > idlest_sgs->group_type)
8879 return false;
8880
8881
8882
8883
8884
8885
8886 switch (sgs->group_type) {
8887 case group_overloaded:
8888 case group_fully_busy:
8889
8890 if (idlest_sgs->avg_load <= sgs->avg_load)
8891 return false;
8892 break;
8893
8894 case group_imbalanced:
8895 case group_asym_packing:
8896
8897 return false;
8898
8899 case group_misfit_task:
8900
8901 if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
8902 return false;
8903 break;
8904
8905 case group_has_spare:
8906
8907 if (idlest_sgs->idle_cpus > sgs->idle_cpus)
8908 return false;
8909
8910
8911 if (idlest_sgs->idle_cpus == sgs->idle_cpus &&
8912 idlest_sgs->group_util <= sgs->group_util)
8913 return false;
8914
8915 break;
8916 }
8917
8918 return true;
8919}
8920
8921
8922
8923
8924
8925
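/*
 * allow_numa_imbalance(): tolerate an imbalance between NUMA nodes as
 * long as the destination runs fewer tasks than a quarter of @dst_weight.
 * For example, with dst_weight = 32 the imbalance is allowed while
 * dst_running < 8.
 */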
8926static inline bool allow_numa_imbalance(int dst_running, int dst_weight)
8927{
8928 return (dst_running < (dst_weight >> 2));
8929}
8930
8931
8932
8933
8934
8935
8936
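/*
 * find_idlest_group(): slow wakeup/fork/exec path. Walk the groups of
 * @sd, gather wakeup statistics for each allowed group, and return the
 * one that looks least loaded for @p, or NULL when the local group is
 * good enough and the task should stay put.
 */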
8937static struct sched_group *
8938find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
8939{
8940 struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
8941 struct sg_lb_stats local_sgs, tmp_sgs;
8942 struct sg_lb_stats *sgs;
8943 unsigned long imbalance;
8944 struct sg_lb_stats idlest_sgs = {
8945 .avg_load = UINT_MAX,
8946 .group_type = group_overloaded,
8947 };
8948
8949 do {
8950 int local_group;
8951
8952
8953 if (!cpumask_intersects(sched_group_span(group),
8954 p->cpus_ptr))
8955 continue;
8956
8957
8958 if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group))
8959 continue;
8960
8961 local_group = cpumask_test_cpu(this_cpu,
8962 sched_group_span(group));
8963
8964 if (local_group) {
8965 sgs = &local_sgs;
8966 local = group;
8967 } else {
8968 sgs = &tmp_sgs;
8969 }
8970
8971 update_sg_wakeup_stats(sd, group, sgs, p);
8972
8973 if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) {
8974 idlest = group;
8975 idlest_sgs = *sgs;
8976 }
8977
8978 } while (group = group->next, group != sd->groups);
8979
8980
8981
8982 if (!idlest)
8983 return NULL;
8984
8985
8986 if (!local)
8987 return idlest;
8988
8989
8990
8991
8992
8993 if (local_sgs.group_type < idlest_sgs.group_type)
8994 return NULL;
8995
8996
8997
8998
8999
9000 if (local_sgs.group_type > idlest_sgs.group_type)
9001 return idlest;
9002
9003 switch (local_sgs.group_type) {
9004 case group_overloaded:
9005 case group_fully_busy:
9006
9007
9008 imbalance = scale_load_down(NICE_0_LOAD) *
9009 (sd->imbalance_pct-100) / 100;
9010
9011
9012
9013
9014
9015
9016
9017
9018
9019
9020 if ((sd->flags & SD_NUMA) &&
9021 ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load))
9022 return NULL;
9023
9024
9025
9026
9027
9028 if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance))
9029 return NULL;
9030
9031 if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load)
9032 return NULL;
9033 break;
9034
9035 case group_imbalanced:
9036 case group_asym_packing:
9037
9038 return NULL;
9039
9040 case group_misfit_task:
9041
9042 if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
9043 return NULL;
9044 break;
9045
9046 case group_has_spare:
9047 if (sd->flags & SD_NUMA) {
9048#ifdef CONFIG_NUMA_BALANCING
9049 int idlest_cpu;
9050
9051
9052
9053
9054 if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
9055 return NULL;
9056
9057 idlest_cpu = cpumask_first(sched_group_span(idlest));
9058 if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
9059 return idlest;
9060#endif
9061
9062
9063
9064
9065
9066
9067 if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight))
9068 return NULL;
9069 }
9070
9071
9072
9073
9074
9075
9076
9077 if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus)
9078 return NULL;
9079 break;
9080 }
9081
9082 return idlest;
9083}
9084
9085
9086
9087
9088
9089
9090
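/*
 * update_sd_lb_stats(): walk every group of env->sd, fill @sds and
 * remember the busiest group; at the top-level domain also publish the
 * root_domain overload and overutilized state derived from the per-group
 * SG_* status bits.
 */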
9091static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
9092{
9093 struct sched_domain *child = env->sd->child;
9094 struct sched_group *sg = env->sd->groups;
9095 struct sg_lb_stats *local = &sds->local_stat;
9096 struct sg_lb_stats tmp_sgs;
9097 int sg_status = 0;
9098
9099 do {
9100 struct sg_lb_stats *sgs = &tmp_sgs;
9101 int local_group;
9102
9103 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
9104 if (local_group) {
9105 sds->local = sg;
9106 sgs = local;
9107
9108 if (env->idle != CPU_NEWLY_IDLE ||
9109 time_after_eq(jiffies, sg->sgc->next_update))
9110 update_group_capacity(env->sd, env->dst_cpu);
9111 }
9112
9113 update_sg_lb_stats(env, sg, sgs, &sg_status);
9114
9115 if (local_group)
9116 goto next_group;
9117
9118
9119 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
9120 sds->busiest = sg;
9121 sds->busiest_stat = *sgs;
9122 }
9123
9124next_group:
9125
9126 sds->total_load += sgs->group_load;
9127 sds->total_capacity += sgs->group_capacity;
9128
9129 sg = sg->next;
9130 } while (sg != env->sd->groups);
9131
9132
9133 sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
9134
9135
9136 if (env->sd->flags & SD_NUMA)
9137 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
9138
9139 if (!env->sd->parent) {
9140 struct root_domain *rd = env->dst_rq->rd;
9141
9142
9143 WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
9144
9145
9146 WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
9147 trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
9148 } else if (sg_status & SG_OVERUTILIZED) {
9149 struct root_domain *rd = env->dst_rq->rd;
9150
9151 WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
9152 trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
9153 }
9154}
9155
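/*
 * adjust_numa_imbalance(): when a small NUMA imbalance is permitted,
 * treat an imbalance of up to NUMA_IMBALANCE_MIN tasks as zero so that a
 * pair of communicating tasks is not split across nodes merely to even
 * out the runqueues.
 */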
9156#define NUMA_IMBALANCE_MIN 2
9157
9158static inline long adjust_numa_imbalance(int imbalance,
9159 int dst_running, int dst_weight)
9160{
9161 if (!allow_numa_imbalance(dst_running, dst_weight))
9162 return imbalance;
9163
9164
9165
9166
9167
9168 if (imbalance <= NUMA_IMBALANCE_MIN)
9169 return 0;
9170
9171 return imbalance;
9172}
9173
9174
9175
9176
9177
9178
9179
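/*
 * calculate_imbalance(): pick env->migration_type (misfit, task, util or
 * load) from the busiest and local group types and set env->imbalance to
 * the amount of that metric which should be moved to restore balance.
 */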
9180static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
9181{
9182 struct sg_lb_stats *local, *busiest;
9183
9184 local = &sds->local_stat;
9185 busiest = &sds->busiest_stat;
9186
9187 if (busiest->group_type == group_misfit_task) {
9188
9189 env->migration_type = migrate_misfit;
9190 env->imbalance = 1;
9191 return;
9192 }
9193
9194 if (busiest->group_type == group_asym_packing) {
9195
9196
9197
9198
9199 env->migration_type = migrate_task;
9200 env->imbalance = busiest->sum_h_nr_running;
9201 return;
9202 }
9203
9204 if (busiest->group_type == group_imbalanced) {
9205
9206
9207
9208
9209
9210
9211 env->migration_type = migrate_task;
9212 env->imbalance = 1;
9213 return;
9214 }
9215
9216
9217
9218
9219
9220 if (local->group_type == group_has_spare) {
9221 if ((busiest->group_type > group_fully_busy) &&
9222 !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) {
9223
9224
9225
9226
9227
9228
9229
9230
9231 env->migration_type = migrate_util;
9232 env->imbalance = max(local->group_capacity, local->group_util) -
9233 local->group_util;
9234
9235
9236
9237
9238
9239
9240
9241
9242 if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) {
9243 env->migration_type = migrate_task;
9244 env->imbalance = 1;
9245 }
9246
9247 return;
9248 }
9249
9250 if (busiest->group_weight == 1 || sds->prefer_sibling) {
9251 unsigned int nr_diff = busiest->sum_nr_running;
9252
9253
9254
9255
9256 env->migration_type = migrate_task;
9257 lsub_positive(&nr_diff, local->sum_nr_running);
9258 env->imbalance = nr_diff >> 1;
9259 } else {
9260
9261
9262
9263
9264
9265 env->migration_type = migrate_task;
9266 env->imbalance = max_t(long, 0, (local->idle_cpus -
9267 busiest->idle_cpus) >> 1);
9268 }
9269
9270
9271 if (env->sd->flags & SD_NUMA) {
9272 env->imbalance = adjust_numa_imbalance(env->imbalance,
9273 busiest->sum_nr_running, busiest->group_weight);
9274 }
9275
9276 return;
9277 }
9278
9279
9280
9281
9282
9283 if (local->group_type < group_overloaded) {
9284
9285
9286
9287
9288
9289 local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
9290 local->group_capacity;
9291
9292 sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
9293 sds->total_capacity;
9294
9295
9296
9297
9298 if (local->avg_load >= busiest->avg_load) {
9299 env->imbalance = 0;
9300 return;
9301 }
9302 }
9303
9304
9305
9306
9307
9308
9309
9310
9311
9312 env->migration_type = migrate_load;
9313 env->imbalance = min(
9314 (busiest->avg_load - sds->avg_load) * busiest->group_capacity,
9315 (sds->avg_load - local->avg_load) * local->group_capacity
9316 ) / SCHED_CAPACITY_SCALE;
9317}
9318
9319
9320
9321
9322
9323
9324
9325
9326
9327
9328
9329
9330
9331
9332
9333
9334
9335
9336
9337
9338
9339
9340
9341
9342
9343
9344
9345
9346
9347
9348
9349
9350
9351
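/*
 * find_busiest_group(): top of the load-balance decision tree. Compute
 * the domain statistics, bail out early when energy-aware scheduling says
 * the system is not overutilized or when the local group is at least as
 * busy as the candidate, and otherwise hand off to calculate_imbalance()
 * and return the busiest group, or NULL if no balancing is needed.
 */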
9352static struct sched_group *find_busiest_group(struct lb_env *env)
9353{
9354 struct sg_lb_stats *local, *busiest;
9355 struct sd_lb_stats sds;
9356
9357 init_sd_lb_stats(&sds);
9358
9359
9360
9361
9362
9363 update_sd_lb_stats(env, &sds);
9364
9365 if (sched_energy_enabled()) {
9366 struct root_domain *rd = env->dst_rq->rd;
9367
9368 if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
9369 goto out_balanced;
9370 }
9371
9372 local = &sds.local_stat;
9373 busiest = &sds.busiest_stat;
9374
9375
9376 if (!sds.busiest)
9377 goto out_balanced;
9378
9379
9380 if (busiest->group_type == group_misfit_task)
9381 goto force_balance;
9382
9383
9384 if (busiest->group_type == group_asym_packing)
9385 goto force_balance;
9386
9387
9388
9389
9390
9391
9392 if (busiest->group_type == group_imbalanced)
9393 goto force_balance;
9394
9395
9396
9397
9398
9399 if (local->group_type > busiest->group_type)
9400 goto out_balanced;
9401
9402
9403
9404
9405
9406 if (local->group_type == group_overloaded) {
9407
9408
9409
9410
9411 if (local->avg_load >= busiest->avg_load)
9412 goto out_balanced;
9413
9414
9415 sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
9416 sds.total_capacity;
9417
9418
9419
9420
9421
9422 if (local->avg_load >= sds.avg_load)
9423 goto out_balanced;
9424
9425
9426
9427
9428
9429 if (100 * busiest->avg_load <=
9430 env->sd->imbalance_pct * local->avg_load)
9431 goto out_balanced;
9432 }
9433
9434
9435 if (sds.prefer_sibling && local->group_type == group_has_spare &&
9436 busiest->sum_nr_running > local->sum_nr_running + 1)
9437 goto force_balance;
9438
9439 if (busiest->group_type != group_overloaded) {
9440 if (env->idle == CPU_NOT_IDLE)
9441
9442
9443
9444
9445
9446 goto out_balanced;
9447
9448 if (busiest->group_weight > 1 &&
9449 local->idle_cpus <= (busiest->idle_cpus + 1))
9450
9451
9452
9453
9454
9455
9456
9457
9458
9459 goto out_balanced;
9460
9461 if (busiest->sum_h_nr_running == 1)
9462
9463
9464
9465 goto out_balanced;
9466 }
9467
9468force_balance:
9469
9470 calculate_imbalance(env, &sds);
9471 return env->imbalance ? sds.busiest : NULL;
9472
9473out_balanced:
9474 env->imbalance = 0;
9475 return NULL;
9476}
9477
9478
9479
9480
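/*
 * find_busiest_queue(): pick the busiest runqueue among the CPUs of
 * @group, using the metric selected by env->migration_type: weighted load
 * scaled by capacity, utilization, task count, or recorded misfit load.
 */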
9481static struct rq *find_busiest_queue(struct lb_env *env,
9482 struct sched_group *group)
9483{
9484 struct rq *busiest = NULL, *rq;
9485 unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
9486 unsigned int busiest_nr = 0;
9487 int i;
9488
9489 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
9490 unsigned long capacity, load, util;
9491 unsigned int nr_running;
9492 enum fbq_type rt;
9493
9494 rq = cpu_rq(i);
9495 rt = fbq_classify_rq(rq);
9496
9497
9498
9499
9500
9501
9502
9503
9504
9505
9506
9507
9508
9509
9510
9511
9512
9513
9514
9515
9516 if (rt > env->fbq_type)
9517 continue;
9518
9519 nr_running = rq->cfs.h_nr_running;
9520 if (!nr_running)
9521 continue;
9522
9523 capacity = capacity_of(i);
9524
9525
9526
9527
9528
9529
9530
9531 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
9532 !capacity_greater(capacity_of(env->dst_cpu), capacity) &&
9533 nr_running == 1)
9534 continue;
9535
9536 switch (env->migration_type) {
9537 case migrate_load:
9538
9539
9540
9541
9542 load = cpu_load(rq);
9543
9544 if (nr_running == 1 && load > env->imbalance &&
9545 !check_cpu_capacity(rq, env->sd))
9546 break;
9547
9548
9549
9550
9551
9552
9553
9554
9555
9556
9557
9558
9559
9560
9561 if (load * busiest_capacity > busiest_load * capacity) {
9562 busiest_load = load;
9563 busiest_capacity = capacity;
9564 busiest = rq;
9565 }
9566 break;
9567
9568 case migrate_util:
9569 util = cpu_util(cpu_of(rq));
9570
9571
9572
9573
9574
9575
9576 if (nr_running <= 1)
9577 continue;
9578
9579 if (busiest_util < util) {
9580 busiest_util = util;
9581 busiest = rq;
9582 }
9583 break;
9584
9585 case migrate_task:
9586 if (busiest_nr < nr_running) {
9587 busiest_nr = nr_running;
9588 busiest = rq;
9589 }
9590 break;
9591
9592 case migrate_misfit:
9593
9594
9595
9596
9597 if (rq->misfit_task_load > busiest_load) {
9598 busiest_load = rq->misfit_task_load;
9599 busiest = rq;
9600 }
9601
9602 break;
9603
9604 }
9605 }
9606
9607 return busiest;
9608}
9609
9610
9611
9612
9613
9614#define MAX_PINNED_INTERVAL 512
9615
9616static inline bool
9617asym_active_balance(struct lb_env *env)
9618{
9619
9620
9621
9622
9623
9624 return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
9625 sched_asym_prefer(env->dst_cpu, env->src_cpu);
9626}
9627
9628static inline bool
9629imbalanced_active_balance(struct lb_env *env)
9630{
9631 struct sched_domain *sd = env->sd;
9632
9633
9634
9635
9636
9637
9638 if ((env->migration_type == migrate_task) &&
9639 (sd->nr_balance_failed > sd->cache_nice_tries+2))
9640 return 1;
9641
9642 return 0;
9643}
9644
9645static int need_active_balance(struct lb_env *env)
9646{
9647 struct sched_domain *sd = env->sd;
9648
9649 if (asym_active_balance(env))
9650 return 1;
9651
9652 if (imbalanced_active_balance(env))
9653 return 1;
9654
9655
9656
9657
9658
9659
9660
9661 if ((env->idle != CPU_NOT_IDLE) &&
9662 (env->src_rq->cfs.h_nr_running == 1)) {
9663 if ((check_cpu_capacity(env->src_rq, sd)) &&
9664 (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
9665 return 1;
9666 }
9667
9668 if (env->migration_type == migrate_misfit)
9669 return 1;
9670
9671 return 0;
9672}
9673
9674static int active_load_balance_cpu_stop(void *data);
9675
9676static int should_we_balance(struct lb_env *env)
9677{
9678 struct sched_group *sg = env->sd->groups;
9679 int cpu;
9680
9681
9682
9683
9684
9685 if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
9686 return 0;
9687
9688
9689
9690
9691
9692 if (env->idle == CPU_NEWLY_IDLE)
9693 return 1;
9694
9695
9696 for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
9697 if (!idle_cpu(cpu))
9698 continue;
9699
9700
9701 return cpu == env->dst_cpu;
9702 }
9703
9704
9705 return group_balance_cpu(sg) == env->dst_cpu;
9706}
9707
9708
9709
9710
9711
9712static int load_balance(int this_cpu, struct rq *this_rq,
9713 struct sched_domain *sd, enum cpu_idle_type idle,
9714 int *continue_balancing)
9715{
9716 int ld_moved, cur_ld_moved, active_balance = 0;
9717 struct sched_domain *sd_parent = sd->parent;
9718 struct sched_group *group;
9719 struct rq *busiest;
9720 struct rq_flags rf;
9721 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
9722
9723 struct lb_env env = {
9724 .sd = sd,
9725 .dst_cpu = this_cpu,
9726 .dst_rq = this_rq,
9727 .dst_grpmask = sched_group_span(sd->groups),
9728 .idle = idle,
9729 .loop_break = sched_nr_migrate_break,
9730 .cpus = cpus,
9731 .fbq_type = all,
9732 .tasks = LIST_HEAD_INIT(env.tasks),
9733 };
9734
9735 cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
9736
9737 schedstat_inc(sd->lb_count[idle]);
9738
9739redo:
9740 if (!should_we_balance(&env)) {
9741 *continue_balancing = 0;
9742 goto out_balanced;
9743 }
9744
9745 group = find_busiest_group(&env);
9746 if (!group) {
9747 schedstat_inc(sd->lb_nobusyg[idle]);
9748 goto out_balanced;
9749 }
9750
9751 busiest = find_busiest_queue(&env, group);
9752 if (!busiest) {
9753 schedstat_inc(sd->lb_nobusyq[idle]);
9754 goto out_balanced;
9755 }
9756
9757 BUG_ON(busiest == env.dst_rq);
9758
9759 schedstat_add(sd->lb_imbalance[idle], env.imbalance);
9760
9761 env.src_cpu = busiest->cpu;
9762 env.src_rq = busiest;
9763
9764 ld_moved = 0;
9765
9766 env.flags |= LBF_ALL_PINNED;
9767 if (busiest->nr_running > 1) {
9768
9769
9770
9771
9772
9773
9774 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
9775
9776more_balance:
9777 rq_lock_irqsave(busiest, &rf);
9778 update_rq_clock(busiest);
9779
9780
9781
9782
9783
9784 cur_ld_moved = detach_tasks(&env);
9785
9786
9787
9788
9789
9790
9791
9792
9793
9794 rq_unlock(busiest, &rf);
9795
9796 if (cur_ld_moved) {
9797 attach_tasks(&env);
9798 ld_moved += cur_ld_moved;
9799 }
9800
9801 local_irq_restore(rf.flags);
9802
9803 if (env.flags & LBF_NEED_BREAK) {
9804 env.flags &= ~LBF_NEED_BREAK;
9805 goto more_balance;
9806 }
9807
9808
9809
9810
9811
9812
9813
9814
9815
9816
9817
9818
9819
9820
9821
9822
9823
9824
9825
9826
9827 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
9828
9829
9830 __cpumask_clear_cpu(env.dst_cpu, env.cpus);
9831
9832 env.dst_rq = cpu_rq(env.new_dst_cpu);
9833 env.dst_cpu = env.new_dst_cpu;
9834 env.flags &= ~LBF_DST_PINNED;
9835 env.loop = 0;
9836 env.loop_break = sched_nr_migrate_break;
9837
9838
9839
9840
9841
9842 goto more_balance;
9843 }
9844
9845
9846
9847
9848 if (sd_parent) {
9849 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
9850
9851 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
9852 *group_imbalance = 1;
9853 }
9854
9855
9856 if (unlikely(env.flags & LBF_ALL_PINNED)) {
9857 __cpumask_clear_cpu(cpu_of(busiest), cpus);
9858
9859
9860
9861
9862
9863
9864
9865
9866 if (!cpumask_subset(cpus, env.dst_grpmask)) {
9867 env.loop = 0;
9868 env.loop_break = sched_nr_migrate_break;
9869 goto redo;
9870 }
9871 goto out_all_pinned;
9872 }
9873 }
9874
9875 if (!ld_moved) {
9876 schedstat_inc(sd->lb_failed[idle]);
9877
9878
9879
9880
9881
9882
9883 if (idle != CPU_NEWLY_IDLE)
9884 sd->nr_balance_failed++;
9885
9886 if (need_active_balance(&env)) {
9887 unsigned long flags;
9888
9889 raw_spin_rq_lock_irqsave(busiest, flags);
9890
9891
9892
9893
9894
9895
9896 if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
9897 raw_spin_rq_unlock_irqrestore(busiest, flags);
9898 goto out_one_pinned;
9899 }
9900
9901
9902 env.flags &= ~LBF_ALL_PINNED;
9903
9904
9905
9906
9907
9908
9909 if (!busiest->active_balance) {
9910 busiest->active_balance = 1;
9911 busiest->push_cpu = this_cpu;
9912 active_balance = 1;
9913 }
9914 raw_spin_rq_unlock_irqrestore(busiest, flags);
9915
9916 if (active_balance) {
9917 stop_one_cpu_nowait(cpu_of(busiest),
9918 active_load_balance_cpu_stop, busiest,
9919 &busiest->active_balance_work);
9920 }
9921 }
9922 } else {
9923 sd->nr_balance_failed = 0;
9924 }
9925
9926 if (likely(!active_balance) || need_active_balance(&env)) {
9927
9928 sd->balance_interval = sd->min_interval;
9929 }
9930
9931 goto out;
9932
9933out_balanced:
9934
9935
9936
9937
9938
9939 if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
9940 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
9941
9942 if (*group_imbalance)
9943 *group_imbalance = 0;
9944 }
9945
9946out_all_pinned:
9947
9948
9949
9950
9951
9952 schedstat_inc(sd->lb_balanced[idle]);
9953
9954 sd->nr_balance_failed = 0;
9955
9956out_one_pinned:
9957 ld_moved = 0;
9958
9959
9960
9961
9962
9963
9964
9965 if (env.idle == CPU_NEWLY_IDLE)
9966 goto out;
9967
9968
9969 if ((env.flags & LBF_ALL_PINNED &&
9970 sd->balance_interval < MAX_PINNED_INTERVAL) ||
9971 sd->balance_interval < sd->max_interval)
9972 sd->balance_interval *= 2;
9973out:
9974 return ld_moved;
9975}
9976
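/*
 * get_sd_balance_interval(): sd->balance_interval is kept in ms and
 * stretched by sd->busy_factor when the CPU is busy. As a rough example
 * with assumed defaults of balance_interval = 8 ms, busy_factor = 16 and
 * HZ = 1000, a busy CPU ends up rebalancing about every 127 jiffies
 * (8 * 16 ms converted to jiffies, minus one so that nested domains do
 * not balance in lockstep), clamped to max_load_balance_interval.
 */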
9977static inline unsigned long
9978get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
9979{
9980 unsigned long interval = sd->balance_interval;
9981
9982 if (cpu_busy)
9983 interval *= sd->busy_factor;
9984
9985
9986 interval = msecs_to_jiffies(interval);
9987
9988
9989
9990
9991
9992
9993 if (cpu_busy)
9994 interval -= 1;
9995
9996 interval = clamp(interval, 1UL, max_load_balance_interval);
9997
9998 return interval;
9999}
10000
10001static inline void
10002update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
10003{
10004 unsigned long interval, next;
10005
10006
10007 interval = get_sd_balance_interval(sd, 0);
10008 next = sd->last_balance + interval;
10009
10010 if (time_after(*next_balance, next))
10011 *next_balance = next;
10012}
10013
10014
10015
10016
10017
10018
10019
10020static int active_load_balance_cpu_stop(void *data)
10021{
10022 struct rq *busiest_rq = data;
10023 int busiest_cpu = cpu_of(busiest_rq);
10024 int target_cpu = busiest_rq->push_cpu;
10025 struct rq *target_rq = cpu_rq(target_cpu);
10026 struct sched_domain *sd;
10027 struct task_struct *p = NULL;
10028 struct rq_flags rf;
10029
10030 rq_lock_irq(busiest_rq, &rf);
10031
10032
10033
10034
10035
10036 if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
10037 goto out_unlock;
10038
10039
10040 if (unlikely(busiest_cpu != smp_processor_id() ||
10041 !busiest_rq->active_balance))
10042 goto out_unlock;
10043
10044
10045 if (busiest_rq->nr_running <= 1)
10046 goto out_unlock;
10047
10048
10049
10050
10051
10052
10053 BUG_ON(busiest_rq == target_rq);
10054
10055
10056 rcu_read_lock();
10057 for_each_domain(target_cpu, sd) {
10058 if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
10059 break;
10060 }
10061
10062 if (likely(sd)) {
10063 struct lb_env env = {
10064 .sd = sd,
10065 .dst_cpu = target_cpu,
10066 .dst_rq = target_rq,
10067 .src_cpu = busiest_rq->cpu,
10068 .src_rq = busiest_rq,
10069 .idle = CPU_IDLE,
10070 .flags = LBF_ACTIVE_LB,
10071 };
10072
10073 schedstat_inc(sd->alb_count);
10074 update_rq_clock(busiest_rq);
10075
10076 p = detach_one_task(&env);
10077 if (p) {
10078 schedstat_inc(sd->alb_pushed);
10079
10080 sd->nr_balance_failed = 0;
10081 } else {
10082 schedstat_inc(sd->alb_failed);
10083 }
10084 }
10085 rcu_read_unlock();
10086out_unlock:
10087 busiest_rq->active_balance = 0;
10088 rq_unlock(busiest_rq, &rf);
10089
10090 if (p)
10091 attach_one_task(target_rq, p);
10092
10093 local_irq_enable();
10094
10095 return 0;
10096}
10097
10098static DEFINE_SPINLOCK(balancing);
10099
10100
10101
10102
10103
10104void update_max_interval(void)
10105{
10106 max_load_balance_interval = HZ*num_online_cpus()/10;
10107}
10108
10109
10110
10111
10112
10113
10114
10115static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
10116{
10117 int continue_balancing = 1;
10118 int cpu = rq->cpu;
10119 int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
10120 unsigned long interval;
10121 struct sched_domain *sd;
10122
10123 unsigned long next_balance = jiffies + 60*HZ;
10124 int update_next_balance = 0;
10125 int need_serialize, need_decay = 0;
10126 u64 max_cost = 0;
10127
10128 rcu_read_lock();
10129 for_each_domain(cpu, sd) {
10130
10131
10132
10133
10134 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
10135 sd->max_newidle_lb_cost =
10136 (sd->max_newidle_lb_cost * 253) / 256;
10137 sd->next_decay_max_lb_cost = jiffies + HZ;
10138 need_decay = 1;
10139 }
10140 max_cost += sd->max_newidle_lb_cost;
10141
10142
10143
10144
10145
10146
10147 if (!continue_balancing) {
10148 if (need_decay)
10149 continue;
10150 break;
10151 }
10152
10153 interval = get_sd_balance_interval(sd, busy);
10154
10155 need_serialize = sd->flags & SD_SERIALIZE;
10156 if (need_serialize) {
10157 if (!spin_trylock(&balancing))
10158 goto out;
10159 }
10160
10161 if (time_after_eq(jiffies, sd->last_balance + interval)) {
10162 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
10163
10164
10165
10166
10167
10168 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
10169 busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
10170 }
10171 sd->last_balance = jiffies;
10172 interval = get_sd_balance_interval(sd, busy);
10173 }
10174 if (need_serialize)
10175 spin_unlock(&balancing);
10176out:
10177 if (time_after(next_balance, sd->last_balance + interval)) {
10178 next_balance = sd->last_balance + interval;
10179 update_next_balance = 1;
10180 }
10181 }
10182 if (need_decay) {
10183
10184
10185
10186
10187 rq->max_idle_balance_cost =
10188 max((u64)sysctl_sched_migration_cost, max_cost);
10189 }
10190 rcu_read_unlock();
10191
10192
10193
10194
10195
10196
10197 if (likely(update_next_balance))
10198 rq->next_balance = next_balance;
10199
10200}
10201
10202static inline int on_null_domain(struct rq *rq)
10203{
10204 return unlikely(!rcu_dereference_sched(rq->sd));
10205}
10206
10207#ifdef CONFIG_NO_HZ_COMMON
10208
10209
10210
10211
10212
10213
10214
10215
10216
10217static inline int find_new_ilb(void)
10218{
10219 int ilb;
10220
10221 for_each_cpu_and(ilb, nohz.idle_cpus_mask,
10222 housekeeping_cpumask(HK_FLAG_MISC)) {
10223
10224 if (ilb == smp_processor_id())
10225 continue;
10226
10227 if (idle_cpu(ilb))
10228 return ilb;
10229 }
10230
10231 return nr_cpu_ids;
10232}
10233
10234
10235
10236
10237
10238static void kick_ilb(unsigned int flags)
10239{
10240 int ilb_cpu;
10241
10242
10243
10244
10245
10246 if (flags & NOHZ_BALANCE_KICK)
10247 nohz.next_balance = jiffies+1;
10248
10249 ilb_cpu = find_new_ilb();
10250
10251 if (ilb_cpu >= nr_cpu_ids)
10252 return;
10253
10254
10255
10256
10257
10258 flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
10259 if (flags & NOHZ_KICK_MASK)
10260 return;
10261
10262
10263
10264
10265
10266
10267 smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
10268}
10269
10270
10271
10272
10273
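/*
 * nohz_balancer_kick(): run from the busy CPU's tick; decide whether the
 * idle load balancer should be woken. Stale blocked load only asks for a
 * stats update, while more than one runnable task, reduced capacity, a
 * higher-priority asym-packing sibling, a misfit task or a busy LLC all
 * request a full balance via NOHZ_KICK_MASK.
 */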
10274static void nohz_balancer_kick(struct rq *rq)
10275{
10276 unsigned long now = jiffies;
10277 struct sched_domain_shared *sds;
10278 struct sched_domain *sd;
10279 int nr_busy, i, cpu = rq->cpu;
10280 unsigned int flags = 0;
10281
10282 if (unlikely(rq->idle_balance))
10283 return;
10284
10285
10286
10287
10288
10289 nohz_balance_exit_idle(rq);
10290
10291
10292
10293
10294
10295 if (likely(!atomic_read(&nohz.nr_cpus)))
10296 return;
10297
10298 if (READ_ONCE(nohz.has_blocked) &&
10299 time_after(now, READ_ONCE(nohz.next_blocked)))
10300 flags = NOHZ_STATS_KICK;
10301
10302 if (time_before(now, nohz.next_balance))
10303 goto out;
10304
10305 if (rq->nr_running >= 2) {
10306 flags = NOHZ_KICK_MASK;
10307 goto out;
10308 }
10309
10310 rcu_read_lock();
10311
10312 sd = rcu_dereference(rq->sd);
10313 if (sd) {
10314
10315
10316
10317
10318
10319 if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
10320 flags = NOHZ_KICK_MASK;
10321 goto unlock;
10322 }
10323 }
10324
10325 sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
10326 if (sd) {
10327
10328
10329
10330
10331
10332 for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
10333 if (sched_asym_prefer(i, cpu)) {
10334 flags = NOHZ_KICK_MASK;
10335 goto unlock;
10336 }
10337 }
10338 }
10339
10340 sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
10341 if (sd) {
10342
10343
10344
10345
10346 if (check_misfit_status(rq, sd)) {
10347 flags = NOHZ_KICK_MASK;
10348 goto unlock;
10349 }
10350
10351
10352
10353
10354
10355
10356
10357
10358 goto unlock;
10359 }
10360
10361 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
10362 if (sds) {
10363
10364
10365
10366
10367
10368
10369
10370
10371
10372 nr_busy = atomic_read(&sds->nr_busy_cpus);
10373 if (nr_busy > 1) {
10374 flags = NOHZ_KICK_MASK;
10375 goto unlock;
10376 }
10377 }
10378unlock:
10379 rcu_read_unlock();
10380out:
10381 if (flags)
10382 kick_ilb(flags);
10383}
10384
10385static void set_cpu_sd_state_busy(int cpu)
10386{
10387 struct sched_domain *sd;
10388
10389 rcu_read_lock();
10390 sd = rcu_dereference(per_cpu(sd_llc, cpu));
10391
10392 if (!sd || !sd->nohz_idle)
10393 goto unlock;
10394 sd->nohz_idle = 0;
10395
10396 atomic_inc(&sd->shared->nr_busy_cpus);
10397unlock:
10398 rcu_read_unlock();
10399}
10400
10401void nohz_balance_exit_idle(struct rq *rq)
10402{
10403 SCHED_WARN_ON(rq != this_rq());
10404
10405 if (likely(!rq->nohz_tick_stopped))
10406 return;
10407
10408 rq->nohz_tick_stopped = 0;
10409 cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
10410 atomic_dec(&nohz.nr_cpus);
10411
10412 set_cpu_sd_state_busy(rq->cpu);
10413}
10414
10415static void set_cpu_sd_state_idle(int cpu)
10416{
10417 struct sched_domain *sd;
10418
10419 rcu_read_lock();
10420 sd = rcu_dereference(per_cpu(sd_llc, cpu));
10421
10422 if (!sd || sd->nohz_idle)
10423 goto unlock;
10424 sd->nohz_idle = 1;
10425
10426 atomic_dec(&sd->shared->nr_busy_cpus);
10427unlock:
10428 rcu_read_unlock();
10429}
10430
10431
10432
10433
10434
10435void nohz_balance_enter_idle(int cpu)
10436{
10437 struct rq *rq = cpu_rq(cpu);
10438
10439 SCHED_WARN_ON(cpu != smp_processor_id());
10440
10441
10442 if (!cpu_active(cpu))
10443 return;
10444
10445
10446 if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
10447 return;
10448
10449
10450
10451
10452
10453
10454 rq->has_blocked_load = 1;
10455
10456
10457
10458
10459
10460
10461
10462 if (rq->nohz_tick_stopped)
10463 goto out;
10464
10465
10466 if (on_null_domain(rq))
10467 return;
10468
10469 rq->nohz_tick_stopped = 1;
10470
10471 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
10472 atomic_inc(&nohz.nr_cpus);
10473
10474
10475
10476
10477
10478
10479 smp_mb__after_atomic();
10480
10481 set_cpu_sd_state_idle(cpu);
10482
10483out:
10484
10485
10486
10487
10488 WRITE_ONCE(nohz.has_blocked, 1);
10489}
10490
10491static bool update_nohz_stats(struct rq *rq)
10492{
10493 unsigned int cpu = rq->cpu;
10494
10495 if (!rq->has_blocked_load)
10496 return false;
10497
10498 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
10499 return false;
10500
10501 if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
10502 return true;
10503
10504 update_blocked_averages(cpu);
10505
10506 return rq->has_blocked_load;
10507}
10508
10509
10510
10511
10512
10513
10514static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
10515 enum cpu_idle_type idle)
10516{
10517
10518 unsigned long now = jiffies;
10519 unsigned long next_balance = now + 60*HZ;
10520 bool has_blocked_load = false;
10521 int update_next_balance = 0;
10522 int this_cpu = this_rq->cpu;
10523 int balance_cpu;
10524 struct rq *rq;
10525
10526 SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
10527
10528
10529
10530
10531
10532
10533
10534
10535
10536 WRITE_ONCE(nohz.has_blocked, 0);
10537
10538
10539
10540
10541
10542 smp_mb();
10543
10544
10545
10546
10547
10548 for_each_cpu_wrap(balance_cpu, nohz.idle_cpus_mask, this_cpu+1) {
10549 if (!idle_cpu(balance_cpu))
10550 continue;
10551
10552
10553
10554
10555
10556
10557 if (need_resched()) {
10558 has_blocked_load = true;
10559 goto abort;
10560 }
10561
10562 rq = cpu_rq(balance_cpu);
10563
10564 has_blocked_load |= update_nohz_stats(rq);
10565
10566
10567
10568
10569
10570 if (time_after_eq(jiffies, rq->next_balance)) {
10571 struct rq_flags rf;
10572
10573 rq_lock_irqsave(rq, &rf);
10574 update_rq_clock(rq);
10575 rq_unlock_irqrestore(rq, &rf);
10576
10577 if (flags & NOHZ_BALANCE_KICK)
10578 rebalance_domains(rq, CPU_IDLE);
10579 }
10580
10581 if (time_after(next_balance, rq->next_balance)) {
10582 next_balance = rq->next_balance;
10583 update_next_balance = 1;
10584 }
10585 }
10586
10587
10588
10589
10590
10591
10592 if (likely(update_next_balance))
10593 nohz.next_balance = next_balance;
10594
10595 WRITE_ONCE(nohz.next_blocked,
10596 now + msecs_to_jiffies(LOAD_AVG_PERIOD));
10597
10598abort:
10599
10600 if (has_blocked_load)
10601 WRITE_ONCE(nohz.has_blocked, 1);
10602}
10603
10604
10605
10606
10607
10608static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
10609{
10610 unsigned int flags = this_rq->nohz_idle_balance;
10611
10612 if (!flags)
10613 return false;
10614
10615 this_rq->nohz_idle_balance = 0;
10616
10617 if (idle != CPU_IDLE)
10618 return false;
10619
10620 _nohz_idle_balance(this_rq, flags, idle);
10621
10622 return true;
10623}
10624
10625
10626
10627
10628
10629void nohz_run_idle_balance(int cpu)
10630{
10631 unsigned int flags;
10632
10633 flags = atomic_fetch_andnot(NOHZ_NEWILB_KICK, nohz_flags(cpu));
10634
10635
10636
10637
10638
10639 if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
10640 _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK, CPU_IDLE);
10641}
10642
10643static void nohz_newidle_balance(struct rq *this_rq)
10644{
10645 int this_cpu = this_rq->cpu;
10646
10647
10648
10649
10650
10651 if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
10652 return;
10653
10654
10655 if (this_rq->avg_idle < sysctl_sched_migration_cost)
10656 return;
10657
10658
10659 if (!READ_ONCE(nohz.has_blocked) ||
10660 time_before(jiffies, READ_ONCE(nohz.next_blocked)))
10661 return;
10662
10663
10664
10665
10666
10667 atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu));
10668}
10669
10670#else
10671static inline void nohz_balancer_kick(struct rq *rq) { }
10672
10673static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
10674{
10675 return false;
10676}
10677
10678static inline void nohz_newidle_balance(struct rq *this_rq) { }
10679#endif
10680
10681
10682
10683
10684
10685
10686
10687
10688
10689
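/*
 * newidle_balance(): called with the rq lock held when this CPU is about
 * to go idle. Returns a positive value if a fair task was pulled, a
 * negative one if a higher-class task appeared while the lock was
 * dropped (forcing pick_next_task() to retry), and 0 otherwise.
 */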
10690static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
10691{
10692 unsigned long next_balance = jiffies + HZ;
10693 int this_cpu = this_rq->cpu;
10694 struct sched_domain *sd;
10695 int pulled_task = 0;
10696 u64 curr_cost = 0;
10697
10698 update_misfit_status(NULL, this_rq);
10699
10700
10701
10702
10703
10704 if (this_rq->ttwu_pending)
10705 return 0;
10706
10707
10708
10709
10710
10711 this_rq->idle_stamp = rq_clock(this_rq);
10712
10713
10714
10715
10716 if (!cpu_active(this_cpu))
10717 return 0;
10718
10719
10720
10721
10722
10723
10724
10725 rq_unpin_lock(this_rq, rf);
10726
10727 if (this_rq->avg_idle < sysctl_sched_migration_cost ||
10728 !READ_ONCE(this_rq->rd->overload)) {
10729
10730 rcu_read_lock();
10731 sd = rcu_dereference_check_sched_domain(this_rq->sd);
10732 if (sd)
10733 update_next_balance(sd, &next_balance);
10734 rcu_read_unlock();
10735
10736 goto out;
10737 }
10738
10739 raw_spin_rq_unlock(this_rq);
10740
10741 update_blocked_averages(this_cpu);
10742 rcu_read_lock();
10743 for_each_domain(this_cpu, sd) {
10744 int continue_balancing = 1;
10745 u64 t0, domain_cost;
10746
10747 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
10748 update_next_balance(sd, &next_balance);
10749 break;
10750 }
10751
10752 if (sd->flags & SD_BALANCE_NEWIDLE) {
10753 t0 = sched_clock_cpu(this_cpu);
10754
10755 pulled_task = load_balance(this_cpu, this_rq,
10756 sd, CPU_NEWLY_IDLE,
10757 &continue_balancing);
10758
10759 domain_cost = sched_clock_cpu(this_cpu) - t0;
10760 if (domain_cost > sd->max_newidle_lb_cost)
10761 sd->max_newidle_lb_cost = domain_cost;
10762
10763 curr_cost += domain_cost;
10764 }
10765
10766 update_next_balance(sd, &next_balance);
10767
10768
10769
10770
10771
10772 if (pulled_task || this_rq->nr_running > 0 ||
10773 this_rq->ttwu_pending)
10774 break;
10775 }
10776 rcu_read_unlock();
10777
10778 raw_spin_rq_lock(this_rq);
10779
10780 if (curr_cost > this_rq->max_idle_balance_cost)
10781 this_rq->max_idle_balance_cost = curr_cost;
10782
10783
10784
10785
10786
10787
10788 if (this_rq->cfs.h_nr_running && !pulled_task)
10789 pulled_task = 1;
10790
10791
10792 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
10793 pulled_task = -1;
10794
10795out:
10796
10797 if (time_after(this_rq->next_balance, next_balance))
10798 this_rq->next_balance = next_balance;
10799
10800 if (pulled_task)
10801 this_rq->idle_stamp = 0;
10802 else
10803 nohz_newidle_balance(this_rq);
10804
10805 rq_repin_lock(this_rq, rf);
10806
10807 return pulled_task;
10808}
10809
10810
10811
10812
10813
10814static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
10815{
10816 struct rq *this_rq = this_rq();
10817 enum cpu_idle_type idle = this_rq->idle_balance ?
10818 CPU_IDLE : CPU_NOT_IDLE;
10819
10820
10821
10822
10823
10824
10825
10826
10827
10828 if (nohz_idle_balance(this_rq, idle))
10829 return;
10830
10831
10832 update_blocked_averages(this_rq->cpu);
10833 rebalance_domains(this_rq, idle);
10834}
10835
10836
10837
10838
10839void trigger_load_balance(struct rq *rq)
10840{
10841
10842
10843
10844
10845 if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq))))
10846 return;
10847
10848 if (time_after_eq(jiffies, rq->next_balance))
10849 raise_softirq(SCHED_SOFTIRQ);
10850
10851 nohz_balancer_kick(rq);
10852}
10853
10854static void rq_online_fair(struct rq *rq)
10855{
10856 update_sysctl();
10857
10858 update_runtime_enabled(rq);
10859}
10860
10861static void rq_offline_fair(struct rq *rq)
10862{
10863 update_sysctl();
10864
10865
10866 unthrottle_offline_cfs_rqs(rq);
10867}
10868
10869#endif
10870
10871#ifdef CONFIG_SCHED_CORE
10872static inline bool
10873__entity_slice_used(struct sched_entity *se, int min_nr_tasks)
10874{
10875 u64 slice = sched_slice(cfs_rq_of(se), se);
10876 u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
10877
10878 return (rtime * min_nr_tasks > slice);
10879}
10880
10881#define MIN_NR_TASKS_DURING_FORCEIDLE 2
10882static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
10883{
10884 if (!sched_core_enabled(rq))
10885 return;
10886
10887
10888
10889
10890
10891
10892
10893
10894
10895
10896
10897
10898
10899
10900
10901 if (rq->core->core_forceidle && rq->cfs.nr_running == 1 &&
10902 __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
10903 resched_curr(rq);
10904}
10905
10906
10907
10908
10909static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle)
10910{
10911 for_each_sched_entity(se) {
10912 struct cfs_rq *cfs_rq = cfs_rq_of(se);
10913
10914 if (forceidle) {
10915 if (cfs_rq->forceidle_seq == fi_seq)
10916 break;
10917 cfs_rq->forceidle_seq = fi_seq;
10918 }
10919
10920 cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
10921 }
10922}
10923
10924void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi)
10925{
10926 struct sched_entity *se = &p->se;
10927
10928 if (p->sched_class != &fair_sched_class)
10929 return;
10930
10931 se_fi_update(se, rq->core->core_forceidle_seq, in_fi);
10932}
10933
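/*
 * cfs_prio_less(): core-scheduling priority comparison. vruntimes of
 * entities on sibling runqueues are not directly comparable, so each one
 * is first normalized against its cfs_rq's min_vruntime_fi snapshot
 * (refreshed by se_fi_update() each force-idle sequence) before the delta
 * decides which task should run.
 */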
10934bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
10935{
10936 struct rq *rq = task_rq(a);
10937 struct sched_entity *sea = &a->se;
10938 struct sched_entity *seb = &b->se;
10939 struct cfs_rq *cfs_rqa;
10940 struct cfs_rq *cfs_rqb;
10941 s64 delta;
10942
10943 SCHED_WARN_ON(task_rq(b)->core != rq->core);
10944
10945#ifdef CONFIG_FAIR_GROUP_SCHED
10946
10947
10948
10949
10950 while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
10951 int sea_depth = sea->depth;
10952 int seb_depth = seb->depth;
10953
10954 if (sea_depth >= seb_depth)
10955 sea = parent_entity(sea);
10956 if (sea_depth <= seb_depth)
10957 seb = parent_entity(seb);
10958 }
10959
10960 se_fi_update(sea, rq->core->core_forceidle_seq, in_fi);
10961 se_fi_update(seb, rq->core->core_forceidle_seq, in_fi);
10962
10963 cfs_rqa = sea->cfs_rq;
10964 cfs_rqb = seb->cfs_rq;
10965#else
10966 cfs_rqa = &task_rq(a)->cfs;
10967 cfs_rqb = &task_rq(b)->cfs;
10968#endif
10969
10970
10971
10972
10973
10974
10975 delta = (s64)(sea->vruntime - seb->vruntime) +
10976 (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
10977
10978 return delta > 0;
10979}
10980#else
10981static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
10982#endif
10983
10984
10985
10986
10987
10988
10989
10990
10991
10992static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
10993{
10994 struct cfs_rq *cfs_rq;
10995 struct sched_entity *se = &curr->se;
10996
10997 for_each_sched_entity(se) {
10998 cfs_rq = cfs_rq_of(se);
10999 entity_tick(cfs_rq, se, queued);
11000 }
11001
11002 if (static_branch_unlikely(&sched_numa_balancing))
11003 task_tick_numa(rq, curr);
11004
11005 update_misfit_status(curr, rq);
11006 update_overutilized_status(task_rq(curr));
11007
11008 task_tick_core(rq, curr);
11009}
11010
11011
11012
11013
11014
11015
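/*
 * task_fork_fair(): runs in the parent's context with the child as
 * argument. The child starts from the parent's vruntime, is placed with
 * the initial-fork adjustment, may preempt the parent if
 * sysctl_sched_child_runs_first is set, and finally has min_vruntime
 * subtracted because it can be queued on a different CPU at
 * wake_up_new_task() time.
 */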
11016static void task_fork_fair(struct task_struct *p)
11017{
11018 struct cfs_rq *cfs_rq;
11019 struct sched_entity *se = &p->se, *curr;
11020 struct rq *rq = this_rq();
11021 struct rq_flags rf;
11022
11023 rq_lock(rq, &rf);
11024 update_rq_clock(rq);
11025
11026 cfs_rq = task_cfs_rq(current);
11027 curr = cfs_rq->curr;
11028 if (curr) {
11029 update_curr(cfs_rq);
11030 se->vruntime = curr->vruntime;
11031 }
11032 place_entity(cfs_rq, se, 1);
11033
11034 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
11035
11036
11037
11038
11039 swap(curr->vruntime, se->vruntime);
11040 resched_curr(rq);
11041 }
11042
11043 se->vruntime -= cfs_rq->min_vruntime;
11044 rq_unlock(rq, &rf);
11045}
11046
11047
11048
11049
11050
11051static void
11052prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
11053{
11054 if (!task_on_rq_queued(p))
11055 return;
11056
11057 if (rq->cfs.nr_running == 1)
11058 return;
11059
11060
11061
11062
11063
11064
11065 if (task_current(rq, p)) {
11066 if (p->prio > oldprio)
11067 resched_curr(rq);
11068 } else
11069 check_preempt_curr(rq, p, 0);
11070}
11071
11072static inline bool vruntime_normalized(struct task_struct *p)
11073{
11074 struct sched_entity *se = &p->se;
11075
11076
11077
11078
11079
11080
11081 if (p->on_rq)
11082 return true;
11083
11084
11085
11086
11087
11088
11089
11090
11091
11092
11093 if (!se->sum_exec_runtime ||
11094 (READ_ONCE(p->__state) == TASK_WAKING && p->sched_remote_wakeup))
11095 return true;
11096
11097 return false;
11098}
11099
11100#ifdef CONFIG_FAIR_GROUP_SCHED
11101
11102
11103
11104
11105static void propagate_entity_cfs_rq(struct sched_entity *se)
11106{
11107 struct cfs_rq *cfs_rq;
11108
11109 list_add_leaf_cfs_rq(cfs_rq_of(se));
11110
11111
11112 se = se->parent;
11113
11114 for_each_sched_entity(se) {
11115 cfs_rq = cfs_rq_of(se);
11116
11117 if (!cfs_rq_throttled(cfs_rq)) {
11118 update_load_avg(cfs_rq, se, UPDATE_TG);
11119 list_add_leaf_cfs_rq(cfs_rq);
11120 continue;
11121 }
11122
11123 if (list_add_leaf_cfs_rq(cfs_rq))
11124 break;
11125 }
11126}
11127#else
11128static void propagate_entity_cfs_rq(struct sched_entity *se) { }
11129#endif
11130
11131static void detach_entity_cfs_rq(struct sched_entity *se)
11132{
11133 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11134
11135
11136 update_load_avg(cfs_rq, se, 0);
11137 detach_entity_load_avg(cfs_rq, se);
11138 update_tg_load_avg(cfs_rq);
11139 propagate_entity_cfs_rq(se);
11140}
11141
11142static void attach_entity_cfs_rq(struct sched_entity *se)
11143{
11144 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11145
11146#ifdef CONFIG_FAIR_GROUP_SCHED
11147
11148
11149
11150
11151 se->depth = se->parent ? se->parent->depth + 1 : 0;
11152#endif
11153
11154
11155 update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
11156 attach_entity_load_avg(cfs_rq, se);
11157 update_tg_load_avg(cfs_rq);
11158 propagate_entity_cfs_rq(se);
11159}
11160
11161static void detach_task_cfs_rq(struct task_struct *p)
11162{
11163 struct sched_entity *se = &p->se;
11164 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11165
11166 if (!vruntime_normalized(p)) {
11167
11168
11169
11170
11171 place_entity(cfs_rq, se, 0);
11172 se->vruntime -= cfs_rq->min_vruntime;
11173 }
11174
11175 detach_entity_cfs_rq(se);
11176}
11177
11178static void attach_task_cfs_rq(struct task_struct *p)
11179{
11180 struct sched_entity *se = &p->se;
11181 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11182
11183 attach_entity_cfs_rq(se);
11184
11185 if (!vruntime_normalized(p))
11186 se->vruntime += cfs_rq->min_vruntime;
11187}
11188
11189static void switched_from_fair(struct rq *rq, struct task_struct *p)
11190{
11191 detach_task_cfs_rq(p);
11192}
11193
11194static void switched_to_fair(struct rq *rq, struct task_struct *p)
11195{
11196 attach_task_cfs_rq(p);
11197
11198 if (task_on_rq_queued(p)) {
11199
11200
11201
11202
11203
11204 if (task_current(rq, p))
11205 resched_curr(rq);
11206 else
11207 check_preempt_curr(rq, p, 0);
11208 }
11209}
11210
11211
11212
11213
11214
11215
11216static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
11217{
11218 struct sched_entity *se = &p->se;
11219
11220#ifdef CONFIG_SMP
11221 if (task_on_rq_queued(p)) {
11222
11223
11224
11225
11226 list_move(&se->group_node, &rq->cfs_tasks);
11227 }
11228#endif
11229
11230 for_each_sched_entity(se) {
11231 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11232
11233 set_next_entity(cfs_rq, se);
11234
11235 account_cfs_rq_runtime(cfs_rq, 0);
11236 }
11237}
11238
11239void init_cfs_rq(struct cfs_rq *cfs_rq)
11240{
11241 cfs_rq->tasks_timeline = RB_ROOT_CACHED;
11242 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
11243#ifndef CONFIG_64BIT
11244 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
11245#endif
11246#ifdef CONFIG_SMP
11247 raw_spin_lock_init(&cfs_rq->removed.lock);
11248#endif
11249}
11250
11251#ifdef CONFIG_FAIR_GROUP_SCHED
11252static void task_set_group_fair(struct task_struct *p)
11253{
11254 struct sched_entity *se = &p->se;
11255
11256 set_task_rq(p, task_cpu(p));
11257 se->depth = se->parent ? se->parent->depth + 1 : 0;
11258}
11259
11260static void task_move_group_fair(struct task_struct *p)
11261{
11262 detach_task_cfs_rq(p);
11263 set_task_rq(p, task_cpu(p));
11264
11265#ifdef CONFIG_SMP
11266
11267 p->se.avg.last_update_time = 0;
11268#endif
11269 attach_task_cfs_rq(p);
11270}
11271
11272static void task_change_group_fair(struct task_struct *p, int type)
11273{
11274 switch (type) {
11275 case TASK_SET_GROUP:
11276 task_set_group_fair(p);
11277 break;
11278
11279 case TASK_MOVE_GROUP:
11280 task_move_group_fair(p);
11281 break;
11282 }
11283}
11284
11285void free_fair_sched_group(struct task_group *tg)
11286{
11287 int i;
11288
11289 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
11290
11291 for_each_possible_cpu(i) {
11292 if (tg->cfs_rq)
11293 kfree(tg->cfs_rq[i]);
11294 if (tg->se)
11295 kfree(tg->se[i]);
11296 }
11297
11298 kfree(tg->cfs_rq);
11299 kfree(tg->se);
11300}
11301
11302int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
11303{
11304 struct sched_entity *se;
11305 struct cfs_rq *cfs_rq;
11306 int i;
11307
11308 tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
11309 if (!tg->cfs_rq)
11310 goto err;
11311 tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL);
11312 if (!tg->se)
11313 goto err;
11314
11315 tg->shares = NICE_0_LOAD;
11316
11317 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
11318
11319 for_each_possible_cpu(i) {
11320 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
11321 GFP_KERNEL, cpu_to_node(i));
11322 if (!cfs_rq)
11323 goto err;
11324
11325 se = kzalloc_node(sizeof(struct sched_entity),
11326 GFP_KERNEL, cpu_to_node(i));
11327 if (!se)
11328 goto err_free_rq;
11329
11330 init_cfs_rq(cfs_rq);
11331 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
11332 init_entity_runnable_average(se);
11333 }
11334
11335 return 1;
11336
11337err_free_rq:
11338 kfree(cfs_rq);
11339err:
11340 return 0;
11341}
11342
11343void online_fair_sched_group(struct task_group *tg)
11344{
11345 struct sched_entity *se;
11346 struct rq_flags rf;
11347 struct rq *rq;
11348 int i;
11349
11350 for_each_possible_cpu(i) {
11351 rq = cpu_rq(i);
11352 se = tg->se[i];
11353 rq_lock_irq(rq, &rf);
11354 update_rq_clock(rq);
11355 attach_entity_cfs_rq(se);
11356 sync_throttle(tg, i);
11357 rq_unlock_irq(rq, &rf);
11358 }
11359}
11360
11361void unregister_fair_sched_group(struct task_group *tg)
11362{
11363 unsigned long flags;
11364 struct rq *rq;
11365 int cpu;
11366
11367 for_each_possible_cpu(cpu) {
11368 if (tg->se[cpu])
11369 remove_entity_load_avg(tg->se[cpu]);
11370
11371
11372
11373
11374
11375 if (!tg->cfs_rq[cpu]->on_list)
11376 continue;
11377
11378 rq = cpu_rq(cpu);
11379
11380 raw_spin_rq_lock_irqsave(rq, flags);
11381 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
11382 raw_spin_rq_unlock_irqrestore(rq, flags);
11383 }
11384}
11385
11386void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
11387 struct sched_entity *se, int cpu,
11388 struct sched_entity *parent)
11389{
11390 struct rq *rq = cpu_rq(cpu);
11391
11392 cfs_rq->tg = tg;
11393 cfs_rq->rq = rq;
11394 init_cfs_rq_runtime(cfs_rq);
11395
11396 tg->cfs_rq[cpu] = cfs_rq;
11397 tg->se[cpu] = se;
11398
11399
11400 if (!se)
11401 return;
11402
11403 if (!parent) {
11404 se->cfs_rq = &rq->cfs;
11405 se->depth = 0;
11406 } else {
11407 se->cfs_rq = parent->my_q;
11408 se->depth = parent->depth + 1;
11409 }
11410
11411 se->my_q = cfs_rq;
11412
11413 update_load_set(&se->load, NICE_0_LOAD);
11414 se->parent = parent;
11415}
11416
11417static DEFINE_MUTEX(shares_mutex);
11418
11419int sched_group_set_shares(struct task_group *tg, unsigned long shares)
11420{
11421 int i;
11422
11423
11424
11425
11426 if (!tg->se[0])
11427 return -EINVAL;
11428
11429 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
11430
11431 mutex_lock(&shares_mutex);
11432 if (tg->shares == shares)
11433 goto done;
11434
11435 tg->shares = shares;
11436 for_each_possible_cpu(i) {
11437 struct rq *rq = cpu_rq(i);
11438 struct sched_entity *se = tg->se[i];
11439 struct rq_flags rf;
11440
11441
11442 rq_lock_irqsave(rq, &rf);
11443 update_rq_clock(rq);
11444 for_each_sched_entity(se) {
11445 update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
11446 update_cfs_group(se);
11447 }
11448 rq_unlock_irqrestore(rq, &rf);
11449 }
11450
11451done:
11452 mutex_unlock(&shares_mutex);
11453 return 0;
11454}
11455#else
11456
11457void free_fair_sched_group(struct task_group *tg) { }
11458
11459int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
11460{
11461 return 1;
11462}
11463
11464void online_fair_sched_group(struct task_group *tg) { }
11465
11466void unregister_fair_sched_group(struct task_group *tg) { }
11467
11468#endif
11469
11470
11471static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
11472{
11473 struct sched_entity *se = &task->se;
11474 unsigned int rr_interval = 0;
11475
11476
11477
11478
11479
11480 if (rq->cfs.load.weight)
11481 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
11482
11483 return rr_interval;
11484}
11485
11486
11487
11488
11489DEFINE_SCHED_CLASS(fair) = {
11490
11491 .enqueue_task = enqueue_task_fair,
11492 .dequeue_task = dequeue_task_fair,
11493 .yield_task = yield_task_fair,
11494 .yield_to_task = yield_to_task_fair,
11495
11496 .check_preempt_curr = check_preempt_wakeup,
11497
11498 .pick_next_task = __pick_next_task_fair,
11499 .put_prev_task = put_prev_task_fair,
11500 .set_next_task = set_next_task_fair,
11501
11502#ifdef CONFIG_SMP
11503 .balance = balance_fair,
11504 .pick_task = pick_task_fair,
11505 .select_task_rq = select_task_rq_fair,
11506 .migrate_task_rq = migrate_task_rq_fair,
11507
11508 .rq_online = rq_online_fair,
11509 .rq_offline = rq_offline_fair,
11510
11511 .task_dead = task_dead_fair,
11512 .set_cpus_allowed = set_cpus_allowed_common,
11513#endif
11514
11515 .task_tick = task_tick_fair,
11516 .task_fork = task_fork_fair,
11517
11518 .prio_changed = prio_changed_fair,
11519 .switched_from = switched_from_fair,
11520 .switched_to = switched_to_fair,
11521
11522 .get_rr_interval = get_rr_interval_fair,
11523
11524 .update_curr = update_curr_fair,
11525
11526#ifdef CONFIG_FAIR_GROUP_SCHED
11527 .task_change_group = task_change_group_fair,
11528#endif
11529
11530#ifdef CONFIG_UCLAMP_TASK
11531 .uclamp_enabled = 1,
11532#endif
11533};
11534
11535#ifdef CONFIG_SCHED_DEBUG
11536void print_cfs_stats(struct seq_file *m, int cpu)
11537{
11538 struct cfs_rq *cfs_rq, *pos;
11539
11540 rcu_read_lock();
11541 for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
11542 print_cfs_rq(m, cpu, cfs_rq);
11543 rcu_read_unlock();
11544}
11545
11546#ifdef CONFIG_NUMA_BALANCING
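/*
 * Added note: dump per-node NUMA fault statistics for @p: shared/private
 * faults recorded by the task itself (tsf/tpf) and by its numa_group
 * (gsf/gpf), one line per online node in the scheduler debug output.
 */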
11547void show_numa_stats(struct task_struct *p, struct seq_file *m)
11548{
11549 int node;
11550 unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
11551 struct numa_group *ng;
11552
11553 rcu_read_lock();
11554 ng = rcu_dereference(p->numa_group);
11555 for_each_online_node(node) {
11556 if (p->numa_faults) {
11557 tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
11558 tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
11559 }
11560 if (ng) {
11561 gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
11562 gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
11563 }
11564 print_numa_stats(m, node, tsf, tpf, gsf, gpf);
11565 }
11566 rcu_read_unlock();
11567}
11568#endif /* CONFIG_NUMA_BALANCING */
11569#endif /* CONFIG_SCHED_DEBUG */
11570
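/*
 * Added note: register the SCHED_SOFTIRQ handler that drives periodic
 * and idle load balancing and, with NO_HZ, prime the nohz idle-balance
 * bookkeeping.
 */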
11571__init void init_sched_fair_class(void)
11572{
11573#ifdef CONFIG_SMP
11574 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
11575
11576#ifdef CONFIG_NO_HZ_COMMON
11577 nohz.next_balance = jiffies;
11578 nohz.next_blocked = jiffies;
11579 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
11580#endif
11581#endif /* CONFIG_SMP */
11582
11583}
11584
11585/*
11586 * Helper functions to facilitate extracting info from tracepoints.
11587 */
11588
11589const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
11590{
11591#ifdef CONFIG_SMP
11592 return cfs_rq ? &cfs_rq->avg : NULL;
11593#else
11594 return NULL;
11595#endif
11596}
11597EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
11598
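/*
 * Added note: write a human-readable path for @cfs_rq's task group into
 * @str. A NULL @cfs_rq yields the string "(null)"; a NULL @str returns
 * NULL.
 */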
11599char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
11600{
11601 if (!cfs_rq) {
11602 if (str)
11603 strlcpy(str, "(null)", len);
11604 else
11605 return NULL;
11606 }
11607
11608 cfs_rq_tg_path(cfs_rq, str, len);
11609 return str;
11610}
11611EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
11612
11613int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
11614{
11615 return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
11616}
11617EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
11618
11619const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
11620{
11621#ifdef CONFIG_SMP
11622 return rq ? &rq->avg_rt : NULL;
11623#else
11624 return NULL;
11625#endif
11626}
11627EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
11628
11629const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
11630{
11631#ifdef CONFIG_SMP
11632 return rq ? &rq->avg_dl : NULL;
11633#else
11634 return NULL;
11635#endif
11636}
11637EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
11638
11639const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
11640{
11641#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
11642 return rq ? &rq->avg_irq : NULL;
11643#else
11644 return NULL;
11645#endif
11646}
11647EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
11648
11649int sched_trace_rq_cpu(struct rq *rq)
11650{
11651 return rq ? cpu_of(rq) : -1;
11652}
11653EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
11654
11655int sched_trace_rq_cpu_capacity(struct rq *rq)
11656{
11657 return rq ?
11658#ifdef CONFIG_SMP
11659 rq->cpu_capacity
11660#else
11661 SCHED_CAPACITY_SCALE
11662#endif
11663 : -1;
11664}
11665EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity);
11666
11667const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
11668{
11669#ifdef CONFIG_SMP
11670 return rd ? rd->span : NULL;
11671#else
11672 return NULL;
11673#endif
11674}
11675EXPORT_SYMBOL_GPL(sched_trace_rd_span);
11676
11677int sched_trace_rq_nr_running(struct rq *rq)
11678{
11679 return rq ? rq->nr_running : -1;
11680}
11681EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);
11682