#include "sched.h"
/*
 * Targeted preemption latency for CPU-bound tasks (default: 6 ms, in
 * nanoseconds; multiplied by the factor from get_update_sysctl_factor()).
 */
unsigned int sysctl_sched_latency = 6000000ULL;
static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
/*
 * How the latency tunables scale with the number of online CPUs:
 * not at all, linearly, or (default) logarithmically, see
 * get_update_sysctl_factor() below.
 */
unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
/*
 * Minimal preemption granularity for CPU-bound tasks
 * (default: 0.75 ms, scaled like sysctl_sched_latency).
 */
unsigned int sysctl_sched_min_granularity = 750000ULL;
static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
/*
 * Kept at sysctl_sched_latency / sysctl_sched_min_granularity,
 * see sched_update_scaling().
 */
static unsigned int sched_nr_latency = 8;
/*
 * After fork, let the child run first. If set to 0 (default) the
 * parent will (try to) run first.
 */
unsigned int sysctl_sched_child_runs_first __read_mostly;
/*
 * SCHED_OTHER wake-up granularity (default: 1 ms, scaled like
 * sysctl_sched_latency). This delays preemption of the current task by a
 * newly woken task, which reduces over-scheduling.
 */
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;

/* Default: 0.5 ms; used as the cut-off when judging whether a task is still cache-hot. */
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
int sched_thermal_decay_shift;
static int __init setup_sched_thermal_decay_shift(char *str)
{
	int _shift = 0;

	if (kstrtoint(str, 0, &_shift))
		pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n");

	sched_thermal_decay_shift = clamp(_shift, 0, 10);
	return 1;
}
__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
#ifdef CONFIG_SMP
/*
 * For asym packing, by default the lower numbered CPU has higher priority.
 */
int __weak arch_asym_cpu_priority(int cpu)
{
	return -cpu;
}

/*
 * Margin used when comparing utilization with CPU capacity:
 * util * 1280 < capacity * 1024, i.e. utilization must stay below
 * roughly 80% of the capacity to "fit".
 */
#define fits_capacity(cap, max)	((cap) * 1280 < (max) * 1024)

/*
 * Margin used when comparing CPU capacities: cap1 must be at least
 * about 5% larger than cap2 to be considered "greater".
 */
#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
#endif
#ifdef CONFIG_CFS_BANDWIDTH
/*
 * Amount of runtime transferred from the global (task-group) bandwidth pool
 * to a per-CPU pool in one go (default: 5 ms, in microseconds).
 */
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
	lw->weight += inc;
	lw->inv_weight = 0;
}

static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
	lw->weight -= dec;
	lw->inv_weight = 0;
}

static inline void update_load_set(struct load_weight *lw, unsigned long w)
{
	lw->weight = w;
	lw->inv_weight = 0;
}
/*
 * Scale the latency tunables by the number of online CPUs (capped at 8):
 * either not at all, linearly, or, by default, logarithmically.
 */
static unsigned int get_update_sysctl_factor(void)
{
	unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
	unsigned int factor;

	switch (sysctl_sched_tunable_scaling) {
	case SCHED_TUNABLESCALING_NONE:
		factor = 1;
		break;
	case SCHED_TUNABLESCALING_LINEAR:
		factor = cpus;
		break;
	case SCHED_TUNABLESCALING_LOG:
	default:
		factor = 1 + ilog2(cpus);
		break;
	}

	return factor;
}

static void update_sysctl(void)
{
	unsigned int factor = get_update_sysctl_factor();

#define SET_SYSCTL(name) \
	(sysctl_##name = (factor) * normalized_sysctl_##name)
	SET_SYSCTL(sched_min_granularity);
	SET_SYSCTL(sched_latency);
	SET_SYSCTL(sched_wakeup_granularity);
#undef SET_SYSCTL
}

void __init sched_init_granularity(void)
{
	update_sysctl();
}
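/*
 * Example (added commentary, not from the original source): with the default
 * logarithmic scaling and 8 or more online CPUs, factor = 1 + ilog2(8) = 4,
 * so the effective sysctl_sched_latency becomes 4 * 6 ms = 24 ms and
 * sysctl_sched_min_granularity becomes 4 * 0.75 ms = 3 ms.
 */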
#define WMULT_CONST	(~0U)
#define WMULT_SHIFT	32

static void __update_inv_weight(struct load_weight *lw)
{
	unsigned long w;

	if (likely(lw->inv_weight))
		return;

	w = scale_load_down(lw->weight);

	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
		lw->inv_weight = 1;
	else if (unlikely(!w))
		lw->inv_weight = WMULT_CONST;
	else
		lw->inv_weight = WMULT_CONST / w;
}
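/*
 * Added commentary, derived from the code below:
 *
 *   delta_exec * weight / lw.weight
 * is approximated as
 *   (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
 *
 * since lw->inv_weight approximates WMULT_CONST / lw->weight (see
 * __update_inv_weight() above). The fls()-based adjustments drop low-order
 * bits of the factor so the intermediate product keeps fitting in 64 bits.
 */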
static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
	u64 fact = scale_load_down(weight);
	u32 fact_hi = (u32)(fact >> 32);
	int shift = WMULT_SHIFT;
	int fs;

	__update_inv_weight(lw);

	if (unlikely(fact_hi)) {
		fs = fls(fact_hi);
		shift -= fs;
		fact >>= fs;
	}

	fact = mul_u32_u32(fact, lw->inv_weight);

	fact_hi = (u32)(fact >> 32);
	if (fact_hi) {
		fs = fls(fact_hi);
		shift -= fs;
		fact >>= fs;
	}

	return mul_u64_u32_shr(delta_exec, fact, shift);
}
const struct sched_class fair_sched_class;

/**************************************************************
 * CFS operations on generic schedulable entities:
 */

#ifdef CONFIG_FAIR_GROUP_SCHED

/* Walk up the scheduling-entity hierarchy: */
#define for_each_sched_entity(se) \
		for (; se; se = se->parent)
static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
{
	if (!path)
		return;

	if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
		autogroup_path(cfs_rq->tg, path, len);
	else if (cfs_rq && cfs_rq->tg->css.cgroup)
		cgroup_path(cfs_rq->tg->css.cgroup, path, len);
	else
		strlcpy(path, "(null)", len);
}
static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	struct rq *rq = rq_of(cfs_rq);
	int cpu = cpu_of(rq);

	if (cfs_rq->on_list)
		return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;

	cfs_rq->on_list = 1;

	/*
	 * The leaf list is kept ordered so that a child cfs_rq always appears
	 * before its parent, which lets blocked load be propagated bottom-up
	 * when the list is walked.
	 */
	if (cfs_rq->tg->parent &&
	    cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
		/*
		 * The parent is already on the list: insert this cfs_rq right
		 * in front of it so the child is walked first.
		 */
		list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
			&(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
		/*
		 * The branch is now fully connected, so the temporary
		 * insertion point goes back to the list head.
		 */
		rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
		return true;
	}

	if (!cfs_rq->tg->parent) {
		/* The root group has no parent: append it to the list. */
		list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
			&rq->leaf_cfs_rq_list);
		rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
		return true;
	}

	/*
	 * The parent has not been added yet: queue this cfs_rq on the
	 * detached branch at rq->tmp_alone_branch. It will be connected to
	 * the list once an ancestor that is already on the list is added.
	 */
	list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
	/* Remember where the parent of this branch must be inserted. */
	rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
	return false;
}
static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	if (cfs_rq->on_list) {
		struct rq *rq = rq_of(cfs_rq);

		/*
		 * With a partially-added branch, rq->tmp_alone_branch may
		 * point at this cfs_rq; move it back to the previous element
		 * before unlinking.
		 */
		if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
			rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;

		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
		cfs_rq->on_list = 0;
	}
}

static inline void assert_list_leaf_cfs_rq(struct rq *rq)
{
	SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
}
/* Iterate through all leaf cfs_rq's on a runqueue: */
#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)			\
	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,	\
				 leaf_cfs_rq_list)

/* Do the two (enqueued) entities belong to the same group? */
static inline struct cfs_rq *
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
	if (se->cfs_rq == pse->cfs_rq)
		return se->cfs_rq;

	return NULL;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return se->parent;
}
static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
	int se_depth, pse_depth;

	/*
	 * Preemption decisions must compare entities that belong to the same
	 * cfs_rq: first bring both entities to the same depth, then walk
	 * them up in lockstep until a common parent group is found.
	 */
	se_depth = (*se)->depth;
	pse_depth = (*pse)->depth;

	while (se_depth > pse_depth) {
		se_depth--;
		*se = parent_entity(*se);
	}

	while (pse_depth > se_depth) {
		pse_depth--;
		*pse = parent_entity(*pse);
	}

	while (!is_same_group(*se, *pse)) {
		*se = parent_entity(*se);
		*pse = parent_entity(*pse);
	}
}
static int tg_is_idle(struct task_group *tg)
{
	return tg->idle > 0;
}

static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
{
	return cfs_rq->idle > 0;
}

static int se_is_idle(struct sched_entity *se)
{
	if (entity_is_task(se))
		return task_has_idle_policy(task_of(se));
	return cfs_rq_is_idle(group_cfs_rq(se));
}
#else

#define for_each_sched_entity(se) \
		for (; se; se = NULL)

static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
{
	if (path)
		strlcpy(path, "(null)", len);
}

static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	return true;
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

static inline void assert_list_leaf_cfs_rq(struct rq *rq)
{
}

#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\
		for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return NULL;
}

static inline void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}

static inline int tg_is_idle(struct task_group *tg)
{
	return 0;
}

static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
{
	return 0;
}

static int se_is_idle(struct sched_entity *se)
{
	return 0;
}

#endif
static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);

/**************************************************************
 * Scheduling class tree data structure manipulation methods:
 */

static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - max_vruntime);
	if (delta > 0)
		max_vruntime = vruntime;

	return max_vruntime;
}

static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta < 0)
		min_vruntime = vruntime;

	return min_vruntime;
}
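/*
 * Note (added commentary): the vruntime comparisons above and in
 * entity_before() below work on the signed difference rather than comparing
 * the u64 values directly, so they stay correct when vruntime wraps around
 * the 64-bit range.
 */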
static inline bool entity_before(struct sched_entity *a,
				 struct sched_entity *b)
{
	return (s64)(a->vruntime - b->vruntime) < 0;
}

#define __node_2_se(node) \
	rb_entry((node), struct sched_entity, run_node)
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);

	u64 vruntime = cfs_rq->min_vruntime;

	if (curr) {
		if (curr->on_rq)
			vruntime = curr->vruntime;
		else
			curr = NULL;
	}

	if (leftmost) { /* non-empty tree */
		struct sched_entity *se = __node_2_se(leftmost);

		if (!curr)
			vruntime = se->vruntime;
		else
			vruntime = min_vruntime(vruntime, se->vruntime);
	}

	/* ensure we never gain time by being placed backwards. */
	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
#ifndef CONFIG_64BIT
	smp_wmb();
	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
}
static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
{
	return entity_before(__node_2_se(a), __node_2_se(b));
}

/*
 * Enqueue an entity into the rb-tree:
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
}

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
}
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);

	if (!left)
		return NULL;

	return __node_2_se(left);
}

static struct sched_entity *__pick_next_entity(struct sched_entity *se)
{
	struct rb_node *next = rb_next(&se->run_node);

	if (!next)
		return NULL;

	return __node_2_se(next);
}
#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);

	if (!last)
		return NULL;

	return __node_2_se(last);
}

/*
 * Recompute sched_nr_latency and the normalized tunables after a
 * sysctl update.
 */
int sched_update_scaling(void)
{
	unsigned int factor = get_update_sysctl_factor();

	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
					sysctl_sched_min_granularity);

#define WRT_SYSCTL(name) \
	(normalized_sysctl_##name = sysctl_##name / (factor))
	WRT_SYSCTL(sched_min_granularity);
	WRT_SYSCTL(sched_latency);
	WRT_SYSCTL(sched_wakeup_granularity);
#undef WRT_SYSCTL

	return 0;
}
#endif
/*
 * delta /= w: scale a duration by NICE_0_LOAD / se->load.weight.
 */
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
	if (unlikely(se->load.weight != NICE_0_LOAD))
		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);

	return delta;
}
/*
 * The period is the window in which every runnable task should get a chance
 * to run once. When there are more tasks than sched_nr_latency, the period
 * is stretched so that each task still gets at least
 * sysctl_sched_min_granularity.
 */
static u64 __sched_period(unsigned long nr_running)
{
	if (unlikely(nr_running > sched_nr_latency))
		return nr_running * sysctl_sched_min_granularity;
	else
		return sysctl_sched_latency;
}
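/*
 * Example (added commentary): with the unscaled defaults above,
 * sched_nr_latency is 8, so up to 8 runnable tasks share the 6 ms latency
 * target; with 16 runnable tasks the period stretches to 16 * 0.75 ms = 12 ms.
 */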
/*
 * The wall-clock slice an entity gets out of the period is proportional to
 * its weight relative to the total load of its cfs_rq, applied at every
 * level of the group hierarchy.
 */
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	unsigned int nr_running = cfs_rq->nr_running;
	u64 slice;

	if (sched_feat(ALT_PERIOD))
		nr_running = rq_of(cfs_rq)->cfs.h_nr_running;

	slice = __sched_period(nr_running + !se->on_rq);

	for_each_sched_entity(se) {
		struct load_weight *load;
		struct load_weight lw;

		cfs_rq = cfs_rq_of(se);
		load = &cfs_rq->load;

		if (unlikely(!se->on_rq)) {
			lw = cfs_rq->load;

			update_load_add(&lw, se->load.weight);
			load = &lw;
		}
		slice = __calc_delta(slice, se->load.weight, load);
	}

	if (sched_feat(BASE_SLICE))
		slice = max(slice, (u64)sysctl_sched_min_granularity);

	return slice;
}
/*
 * The same slice expressed in vruntime units, i.e. weighted by the
 * entity's own load.
 */
static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
#include "pelt.h"
#ifdef CONFIG_SMP

static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
static unsigned long task_h_load(struct task_struct *p);
static unsigned long capacity_of(int cpu);

/* Initialize the PELT averages of a freshly created sched_entity. */
void init_entity_runnable_average(struct sched_entity *se)
{
	struct sched_avg *sa = &se->avg;

	memset(sa, 0, sizeof(*sa));

	/*
	 * Tasks start with full load so they are treated as heavy until
	 * their real utilization has been observed; group entities start at
	 * zero load because nothing has been attached to the group yet.
	 */
	if (entity_is_task(se))
		sa->load_avg = scale_load_down(se->load.weight);

	/* The entity contributes to its cfs_rq's load_avg once enqueued. */
}

static void attach_entity_cfs_rq(struct sched_entity *se);
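/*
 * Added commentary, derived from the code below: a new task's util_avg is
 * seeded from its cfs_rq's current utilization, scaled by the task's share
 * of the cfs_rq load and clamped to half of the CPU capacity that is still
 * unused; if the cfs_rq has no utilization yet, the task simply gets that
 * half of the spare capacity. runnable_avg is seeded with the same value,
 * and tasks of other scheduling classes only get their PELT clock set.
 */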
void post_init_entity_util_avg(struct task_struct *p)
{
	struct sched_entity *se = &p->se;
	struct cfs_rq *cfs_rq = cfs_rq_of(se);
	struct sched_avg *sa = &se->avg;
	long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
	long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;

	if (cap > 0) {
		if (cfs_rq->avg.util_avg != 0) {
			sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
			sa->util_avg /= (cfs_rq->avg.load_avg + 1);

			if (sa->util_avg > cap)
				sa->util_avg = cap;
		} else {
			sa->util_avg = cap;
		}
	}

	sa->runnable_avg = sa->util_avg;

	if (p->sched_class != &fair_sched_class) {
		/*
		 * Not a CFS task: just record the current PELT clock so a
		 * later switch to the fair class starts from a sane point,
		 * and skip attaching to the cfs_rq.
		 */
		se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
		return;
	}

	attach_entity_cfs_rq(se);
}
#else
void init_entity_runnable_average(struct sched_entity *se)
{
}
void post_init_entity_util_avg(struct task_struct *p)
{
}
static void update_tg_load_avg(struct cfs_rq *cfs_rq)
{
}
#endif
/*
 * Update the current task's runtime statistics.
 */
static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	u64 now = rq_clock_task(rq_of(cfs_rq));
	u64 delta_exec;

	if (unlikely(!curr))
		return;

	delta_exec = now - curr->exec_start;
	if (unlikely((s64)delta_exec <= 0))
		return;

	curr->exec_start = now;

	schedstat_set(curr->statistics.exec_max,
		      max(delta_exec, curr->statistics.exec_max));

	curr->sum_exec_runtime += delta_exec;
	schedstat_add(cfs_rq->exec_clock, delta_exec);

	curr->vruntime += calc_delta_fair(delta_exec, curr);
	update_min_vruntime(cfs_rq);

	if (entity_is_task(curr)) {
		struct task_struct *curtask = task_of(curr);

		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
		cgroup_account_cputime(curtask, delta_exec);
		account_group_exec_runtime(curtask, delta_exec);
	}

	account_cfs_rq_runtime(cfs_rq, delta_exec);
}
static void update_curr_fair(struct rq *rq)
{
	update_curr(cfs_rq_of(&rq->curr->se));
}
865static inline void
866update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
867{
868 u64 wait_start, prev_wait_start;
869
870 if (!schedstat_enabled())
871 return;
872
873 wait_start = rq_clock(rq_of(cfs_rq));
874 prev_wait_start = schedstat_val(se->statistics.wait_start);
875
876 if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
877 likely(wait_start > prev_wait_start))
878 wait_start -= prev_wait_start;
879
880 __schedstat_set(se->statistics.wait_start, wait_start);
881}
882
883static inline void
884update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
885{
886 struct task_struct *p;
887 u64 delta;
888
889 if (!schedstat_enabled())
890 return;
891
892
893
894
895
896
897
898 if (unlikely(!schedstat_val(se->statistics.wait_start)))
899 return;
900
901 delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
902
903 if (entity_is_task(se)) {
904 p = task_of(se);
905 if (task_on_rq_migrating(p)) {
906
907
908
909
910
911 __schedstat_set(se->statistics.wait_start, delta);
912 return;
913 }
914 trace_sched_stat_wait(p, delta);
915 }
916
917 __schedstat_set(se->statistics.wait_max,
918 max(schedstat_val(se->statistics.wait_max), delta));
919 __schedstat_inc(se->statistics.wait_count);
920 __schedstat_add(se->statistics.wait_sum, delta);
921 __schedstat_set(se->statistics.wait_start, 0);
922}
923
924static inline void
925update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
926{
927 struct task_struct *tsk = NULL;
928 u64 sleep_start, block_start;
929
930 if (!schedstat_enabled())
931 return;
932
933 sleep_start = schedstat_val(se->statistics.sleep_start);
934 block_start = schedstat_val(se->statistics.block_start);
935
936 if (entity_is_task(se))
937 tsk = task_of(se);
938
939 if (sleep_start) {
940 u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
941
942 if ((s64)delta < 0)
943 delta = 0;
944
945 if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
946 __schedstat_set(se->statistics.sleep_max, delta);
947
948 __schedstat_set(se->statistics.sleep_start, 0);
949 __schedstat_add(se->statistics.sum_sleep_runtime, delta);
950
951 if (tsk) {
952 account_scheduler_latency(tsk, delta >> 10, 1);
953 trace_sched_stat_sleep(tsk, delta);
954 }
955 }
956 if (block_start) {
957 u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
958
959 if ((s64)delta < 0)
960 delta = 0;
961
962 if (unlikely(delta > schedstat_val(se->statistics.block_max)))
963 __schedstat_set(se->statistics.block_max, delta);
964
965 __schedstat_set(se->statistics.block_start, 0);
966 __schedstat_add(se->statistics.sum_sleep_runtime, delta);
967
968 if (tsk) {
969 if (tsk->in_iowait) {
970 __schedstat_add(se->statistics.iowait_sum, delta);
971 __schedstat_inc(se->statistics.iowait_count);
972 trace_sched_stat_iowait(tsk, delta);
973 }
974
975 trace_sched_stat_blocked(tsk, delta);
976
977
978
979
980
981
982 if (unlikely(prof_on == SLEEP_PROFILING)) {
983 profile_hits(SLEEP_PROFILING,
984 (void *)get_wchan(tsk),
985 delta >> 20);
986 }
987 account_scheduler_latency(tsk, delta >> 10, 0);
988 }
989 }
990}
991
992
993
994
995static inline void
996update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
997{
998 if (!schedstat_enabled())
999 return;
1000
1001
1002
1003
1004
1005 if (se != cfs_rq->curr)
1006 update_stats_wait_start(cfs_rq, se);
1007
1008 if (flags & ENQUEUE_WAKEUP)
1009 update_stats_enqueue_sleeper(cfs_rq, se);
1010}
1011
1012static inline void
1013update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1014{
1015
1016 if (!schedstat_enabled())
1017 return;
1018
1019
1020
1021
1022
1023 if (se != cfs_rq->curr)
1024 update_stats_wait_end(cfs_rq, se);
1025
1026 if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
1027 struct task_struct *tsk = task_of(se);
1028 unsigned int state;
1029
1030
1031 state = READ_ONCE(tsk->__state);
1032 if (state & TASK_INTERRUPTIBLE)
1033 __schedstat_set(se->statistics.sleep_start,
1034 rq_clock(rq_of(cfs_rq)));
1035 if (state & TASK_UNINTERRUPTIBLE)
1036 __schedstat_set(se->statistics.block_start,
1037 rq_clock(rq_of(cfs_rq)));
1038 }
1039}
1040
1041
1042
1043
1044static inline void
1045update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
1046{
1047
1048
1049
1050 se->exec_start = rq_clock_task(rq_of(cfs_rq));
1051}
1052
#ifdef CONFIG_NUMA_BALANCING
/*
 * Approximate time to scan a full NUMA task, in ms. The actual scan period
 * is derived from the task's virtual memory size and
 * sysctl_numa_balancing_scan_size (see task_nr_scan_windows() below).
 */
unsigned int sysctl_numa_balancing_scan_period_min = 1000;
unsigned int sysctl_numa_balancing_scan_period_max = 60000;

/* Portion of address space to scan, in MB */
unsigned int sysctl_numa_balancing_scan_size = 256;

/* Scan @scan_size MB every @scan_period after an initial @scan_delay, in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
1071
1072struct numa_group {
1073 refcount_t refcount;
1074
1075 spinlock_t lock;
1076 int nr_tasks;
1077 pid_t gid;
1078 int active_nodes;
1079
1080 struct rcu_head rcu;
1081 unsigned long total_faults;
1082 unsigned long max_faults_cpu;
1083
1084
1085
1086
1087
1088 unsigned long *faults_cpu;
1089 unsigned long faults[];
1090};
1091
1092
1093
1094
1095
1096static struct numa_group *deref_task_numa_group(struct task_struct *p)
1097{
1098 return rcu_dereference_check(p->numa_group, p == current ||
1099 (lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu)));
1100}
1101
1102static struct numa_group *deref_curr_numa_group(struct task_struct *p)
1103{
1104 return rcu_dereference_protected(p->numa_group, p == current);
1105}
1106
static inline unsigned long group_faults_priv(struct numa_group *ng);
static inline unsigned long group_faults_shared(struct numa_group *ng);

static unsigned int task_nr_scan_windows(struct task_struct *p)
{
	unsigned long rss = 0;
	unsigned long nr_scan_pages;

	/*
	 * Base the calculation on RSS: non-present and empty pages are
	 * skipped by the pte scanner, so they never generate hinting faults.
	 */
	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
	rss = get_mm_rss(p->mm);
	if (!rss)
		rss = nr_scan_pages;

	rss = round_up(rss, nr_scan_pages);
	return rss / nr_scan_pages;
}

/* Cap the scanning rate at MAX_SCAN_WINDOW MB per second. */
#define MAX_SCAN_WINDOW 2560

static unsigned int task_scan_min(struct task_struct *p)
{
	unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
	unsigned int scan, floor;
	unsigned int windows = 1;

	if (scan_size < MAX_SCAN_WINDOW)
		windows = MAX_SCAN_WINDOW / scan_size;
	floor = 1000 / windows;

	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
	return max_t(unsigned int, floor, scan);
}
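/*
 * Example (added commentary): with the default 256 MB scan size, a task with
 * a 1 GB RSS gets task_nr_scan_windows() == 4, so its minimum scan period is
 * 1000 ms / 4 = 250 ms; the floor above keeps the period from dropping below
 * 1000 ms / (2560 / 256) = 100 ms for very large tasks.
 */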
1145
1146static unsigned int task_scan_start(struct task_struct *p)
1147{
1148 unsigned long smin = task_scan_min(p);
1149 unsigned long period = smin;
1150 struct numa_group *ng;
1151
1152
1153 rcu_read_lock();
1154 ng = rcu_dereference(p->numa_group);
1155 if (ng) {
1156 unsigned long shared = group_faults_shared(ng);
1157 unsigned long private = group_faults_priv(ng);
1158
1159 period *= refcount_read(&ng->refcount);
1160 period *= shared + 1;
1161 period /= private + shared + 1;
1162 }
1163 rcu_read_unlock();
1164
1165 return max(smin, period);
1166}
1167
1168static unsigned int task_scan_max(struct task_struct *p)
1169{
1170 unsigned long smin = task_scan_min(p);
1171 unsigned long smax;
1172 struct numa_group *ng;
1173
1174
1175 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
1176
1177
1178 ng = deref_curr_numa_group(p);
1179 if (ng) {
1180 unsigned long shared = group_faults_shared(ng);
1181 unsigned long private = group_faults_priv(ng);
1182 unsigned long period = smax;
1183
1184 period *= refcount_read(&ng->refcount);
1185 period *= shared + 1;
1186 period /= private + shared + 1;
1187
1188 smax = max(smax, period);
1189 }
1190
1191 return max(smin, smax);
1192}
1193
1194static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1195{
1196 rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
1197 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1198}
1199
1200static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1201{
1202 rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
1203 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1204}
1205
/* Shared or private faults. */
#define NR_NUMA_HINT_FAULT_TYPES 2

/* Memory and CPU locality */
#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)

/* Averaged statistics, and temporary buffers. */
#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1214
1215pid_t task_numa_group_id(struct task_struct *p)
1216{
1217 struct numa_group *ng;
1218 pid_t gid = 0;
1219
1220 rcu_read_lock();
1221 ng = rcu_dereference(p->numa_group);
1222 if (ng)
1223 gid = ng->gid;
1224 rcu_read_unlock();
1225
1226 return gid;
1227}
1228
/*
 * The averaged statistics, shared & private, memory & CPU, occupy the first
 * half of the array. The second half holds the current counters, which are
 * folded into the first set by task_numa_placement().
 */
static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
{
	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
}
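/*
 * Example (added commentary, assuming the enum order NUMA_MEM, NUMA_CPU,
 * NUMA_MEMBUF, NUMA_CPUBUF declared in sched.h): the index computed above is
 *   NR_NUMA_HINT_FAULT_TYPES * (stat * nr_node_ids + nid) + priv
 * so on a two-node system the NUMA_MEM statistics occupy entries 0-3,
 * NUMA_CPU entries 4-7, and the MEMBUF/CPUBUF temporary buffers entries
 * 8-15, i.e. NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids entries in total.
 */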
1239
1240static inline unsigned long task_faults(struct task_struct *p, int nid)
1241{
1242 if (!p->numa_faults)
1243 return 0;
1244
1245 return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1246 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
1247}
1248
1249static inline unsigned long group_faults(struct task_struct *p, int nid)
1250{
1251 struct numa_group *ng = deref_task_numa_group(p);
1252
1253 if (!ng)
1254 return 0;
1255
1256 return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1257 ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
1258}
1259
1260static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1261{
1262 return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
1263 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
1264}
1265
1266static inline unsigned long group_faults_priv(struct numa_group *ng)
1267{
1268 unsigned long faults = 0;
1269 int node;
1270
1271 for_each_online_node(node) {
1272 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
1273 }
1274
1275 return faults;
1276}
1277
1278static inline unsigned long group_faults_shared(struct numa_group *ng)
1279{
1280 unsigned long faults = 0;
1281 int node;
1282
1283 for_each_online_node(node) {
1284 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
1285 }
1286
1287 return faults;
1288}
1289
#define ACTIVE_NODE_FRACTION 3

static bool numa_is_active_node(int nid, struct numa_group *ng)
{
	return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
}
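/*
 * Added commentary: a node counts as "active" for a numa_group when it has
 * seen more than 1/ACTIVE_NODE_FRACTION (one third) of the CPU faults
 * recorded on the group's busiest node; numa_group_count_active_nodes()
 * applies the same test when maintaining ng->active_nodes.
 */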
1301
1302
1303static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1304 int maxdist, bool task)
1305{
1306 unsigned long score = 0;
1307 int node;
1308
1309
1310
1311
1312
1313 if (sched_numa_topology_type == NUMA_DIRECT)
1314 return 0;
1315
1316
1317
1318
1319
1320 for_each_online_node(node) {
1321 unsigned long faults;
1322 int dist = node_distance(nid, node);
1323
1324
1325
1326
1327
1328 if (dist == sched_max_numa_distance || node == nid)
1329 continue;
1330
1331
1332
1333
1334
1335
1336
1337
1338 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1339 dist >= maxdist)
1340 continue;
1341
1342
1343 if (task)
1344 faults = task_faults(p, node);
1345 else
1346 faults = group_faults(p, node);
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1357 faults *= (sched_max_numa_distance - dist);
1358 faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1359 }
1360
1361 score += faults;
1362 }
1363
1364 return score;
1365}
1366
1367
1368
1369
1370
1371
1372
1373static inline unsigned long task_weight(struct task_struct *p, int nid,
1374 int dist)
1375{
1376 unsigned long faults, total_faults;
1377
1378 if (!p->numa_faults)
1379 return 0;
1380
1381 total_faults = p->total_numa_faults;
1382
1383 if (!total_faults)
1384 return 0;
1385
1386 faults = task_faults(p, nid);
1387 faults += score_nearby_nodes(p, nid, dist, true);
1388
1389 return 1000 * faults / total_faults;
1390}
1391
1392static inline unsigned long group_weight(struct task_struct *p, int nid,
1393 int dist)
1394{
1395 struct numa_group *ng = deref_task_numa_group(p);
1396 unsigned long faults, total_faults;
1397
1398 if (!ng)
1399 return 0;
1400
1401 total_faults = ng->total_faults;
1402
1403 if (!total_faults)
1404 return 0;
1405
1406 faults = group_faults(p, nid);
1407 faults += score_nearby_nodes(p, nid, dist, false);
1408
1409 return 1000 * faults / total_faults;
1410}
1411
1412bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1413 int src_nid, int dst_cpu)
1414{
1415 struct numa_group *ng = deref_curr_numa_group(p);
1416 int dst_nid = cpu_to_node(dst_cpu);
1417 int last_cpupid, this_cpupid;
1418
1419 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1420 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1421
1422
1423
1424
1425
1426
1427
1428 if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
1429 (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
1430 return true;
	/*
	 * Multi-stage node selection (added commentary): a page is only
	 * migrated towards @dst_nid if the previous hinting fault on it also
	 * came from that node, so a page has to fault twice from the same
	 * place before it is moved. This filters out pages that merely
	 * bounce between nodes.
	 */
1449 if (!cpupid_pid_unset(last_cpupid) &&
1450 cpupid_to_nid(last_cpupid) != dst_nid)
1451 return false;
1452
1453
1454 if (cpupid_match_pid(p, last_cpupid))
1455 return true;
1456
1457
1458 if (!ng)
1459 return true;
1460
1461
1462
1463
1464
1465 if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
1466 ACTIVE_NODE_FRACTION)
1467 return true;
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477 return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
1478 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
1479}
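/*
 * Added commentary on the final comparison above: migrate the page only if
 * the group's CPU faults favour the destination node over the source node
 * by a larger ratio than its memory faults already do, with an extra 4/3
 * margin that biases against needless back-and-forth migration.
 */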
1480
/* How a node is classified for the purpose of NUMA placement. */
enum numa_type {
	/* The node has spare capacity that can be used to run more tasks. */
	node_has_spare = 0,
	/*
	 * The node is fully used and the tasks don't compete for more CPU
	 * cycles. Nevertheless, some tasks might wait before running.
	 */
	node_fully_busy,
	/*
	 * The node is overloaded and can't provide the expected CPU cycles
	 * to all tasks.
	 */
	node_overloaded
};

/* Cached statistics for all CPUs within a node */
struct numa_stats {
	unsigned long load;
	unsigned long runnable;
	unsigned long util;
	/* Total compute capacity of CPUs on a node */
	unsigned long compute_capacity;
	unsigned int nr_running;
	unsigned int weight;
	enum numa_type node_type;
	int idle_cpu;
};
1511
1512static inline bool is_core_idle(int cpu)
1513{
1514#ifdef CONFIG_SCHED_SMT
1515 int sibling;
1516
1517 for_each_cpu(sibling, cpu_smt_mask(cpu)) {
1518 if (cpu == sibling)
1519 continue;
1520
1521 if (!idle_cpu(sibling))
1522 return false;
1523 }
1524#endif
1525
1526 return true;
1527}
1528
1529struct task_numa_env {
1530 struct task_struct *p;
1531
1532 int src_cpu, src_nid;
1533 int dst_cpu, dst_nid;
1534
1535 struct numa_stats src_stats, dst_stats;
1536
1537 int imbalance_pct;
1538 int dist;
1539
1540 struct task_struct *best_task;
1541 long best_imp;
1542 int best_cpu;
1543};
1544
1545static unsigned long cpu_load(struct rq *rq);
1546static unsigned long cpu_runnable(struct rq *rq);
1547static unsigned long cpu_util(int cpu);
1548static inline long adjust_numa_imbalance(int imbalance,
1549 int dst_running, int dst_weight);
1550
1551static inline enum
1552numa_type numa_classify(unsigned int imbalance_pct,
1553 struct numa_stats *ns)
1554{
1555 if ((ns->nr_running > ns->weight) &&
1556 (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
1557 ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
1558 return node_overloaded;
1559
1560 if ((ns->nr_running < ns->weight) ||
1561 (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
1562 ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
1563 return node_has_spare;
1564
1565 return node_fully_busy;
1566}
1567
1568#ifdef CONFIG_SCHED_SMT
1569
1570static inline bool test_idle_cores(int cpu, bool def);
1571static inline int numa_idle_core(int idle_core, int cpu)
1572{
1573 if (!static_branch_likely(&sched_smt_present) ||
1574 idle_core >= 0 || !test_idle_cores(cpu, false))
1575 return idle_core;
1576
1577
1578
1579
1580
1581 if (is_core_idle(cpu))
1582 idle_core = cpu;
1583
1584 return idle_core;
1585}
1586#else
1587static inline int numa_idle_core(int idle_core, int cpu)
1588{
1589 return idle_core;
1590}
1591#endif
1592
1593
1594
1595
1596
1597
1598
1599static void update_numa_stats(struct task_numa_env *env,
1600 struct numa_stats *ns, int nid,
1601 bool find_idle)
1602{
1603 int cpu, idle_core = -1;
1604
1605 memset(ns, 0, sizeof(*ns));
1606 ns->idle_cpu = -1;
1607
1608 rcu_read_lock();
1609 for_each_cpu(cpu, cpumask_of_node(nid)) {
1610 struct rq *rq = cpu_rq(cpu);
1611
1612 ns->load += cpu_load(rq);
1613 ns->runnable += cpu_runnable(rq);
1614 ns->util += cpu_util(cpu);
1615 ns->nr_running += rq->cfs.h_nr_running;
1616 ns->compute_capacity += capacity_of(cpu);
1617
1618 if (find_idle && !rq->nr_running && idle_cpu(cpu)) {
1619 if (READ_ONCE(rq->numa_migrate_on) ||
1620 !cpumask_test_cpu(cpu, env->p->cpus_ptr))
1621 continue;
1622
1623 if (ns->idle_cpu == -1)
1624 ns->idle_cpu = cpu;
1625
1626 idle_core = numa_idle_core(idle_core, cpu);
1627 }
1628 }
1629 rcu_read_unlock();
1630
1631 ns->weight = cpumask_weight(cpumask_of_node(nid));
1632
1633 ns->node_type = numa_classify(env->imbalance_pct, ns);
1634
1635 if (idle_core >= 0)
1636 ns->idle_cpu = idle_core;
1637}
1638
1639static void task_numa_assign(struct task_numa_env *env,
1640 struct task_struct *p, long imp)
1641{
1642 struct rq *rq = cpu_rq(env->dst_cpu);
1643
1644
1645 if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) {
1646 int cpu;
1647 int start = env->dst_cpu;
1648
1649
1650 for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) {
1651 if (cpu == env->best_cpu || !idle_cpu(cpu) ||
1652 !cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
1653 continue;
1654 }
1655
1656 env->dst_cpu = cpu;
1657 rq = cpu_rq(env->dst_cpu);
1658 if (!xchg(&rq->numa_migrate_on, 1))
1659 goto assign;
1660 }
1661
1662
1663 return;
1664 }
1665
1666assign:
1667
1668
1669
1670
1671 if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) {
1672 rq = cpu_rq(env->best_cpu);
1673 WRITE_ONCE(rq->numa_migrate_on, 0);
1674 }
1675
1676 if (env->best_task)
1677 put_task_struct(env->best_task);
1678 if (p)
1679 get_task_struct(p);
1680
1681 env->best_task = p;
1682 env->best_imp = imp;
1683 env->best_cpu = env->dst_cpu;
1684}
1685
1686static bool load_too_imbalanced(long src_load, long dst_load,
1687 struct task_numa_env *env)
1688{
1689 long imb, old_imb;
1690 long orig_src_load, orig_dst_load;
1691 long src_capacity, dst_capacity;
1692
1693
1694
1695
1696
1697
1698
1699
1700 src_capacity = env->src_stats.compute_capacity;
1701 dst_capacity = env->dst_stats.compute_capacity;
1702
1703 imb = abs(dst_load * src_capacity - src_load * dst_capacity);
1704
1705 orig_src_load = env->src_stats.load;
1706 orig_dst_load = env->dst_stats.load;
1707
1708 old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
1709
1710
1711 return (imb > old_imb);
1712}
1713
/*
 * Importance values returned by task_weight()/group_weight() are on a
 * roughly 0-1000 scale; improvements smaller than SMALLIMP are treated as
 * noise and not worth acting on.
 */
#define SMALLIMP	30

/*
 * Check whether swapping env->p with the task currently running on
 * env->dst_cpu (or simply moving env->p there) would improve the overall
 * NUMA placement, and record the best option found so far in *env.
 */
1727static bool task_numa_compare(struct task_numa_env *env,
1728 long taskimp, long groupimp, bool maymove)
1729{
1730 struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
1731 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1732 long imp = p_ng ? groupimp : taskimp;
1733 struct task_struct *cur;
1734 long src_load, dst_load;
1735 int dist = env->dist;
1736 long moveimp = imp;
1737 long load;
1738 bool stopsearch = false;
1739
1740 if (READ_ONCE(dst_rq->numa_migrate_on))
1741 return false;
1742
1743 rcu_read_lock();
1744 cur = rcu_dereference(dst_rq->curr);
1745 if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
1746 cur = NULL;
1747
1748
1749
1750
1751
1752 if (cur == env->p) {
1753 stopsearch = true;
1754 goto unlock;
1755 }
1756
1757 if (!cur) {
1758 if (maymove && moveimp >= env->best_imp)
1759 goto assign;
1760 else
1761 goto unlock;
1762 }
1763
1764
1765 if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
1766 goto unlock;
1767
1768
1769
1770
1771
1772 if (env->best_task &&
1773 env->best_task->numa_preferred_nid == env->src_nid &&
1774 cur->numa_preferred_nid != env->src_nid) {
1775 goto unlock;
1776 }
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788 cur_ng = rcu_dereference(cur->numa_group);
1789 if (cur_ng == p_ng) {
1790 imp = taskimp + task_weight(cur, env->src_nid, dist) -
1791 task_weight(cur, env->dst_nid, dist);
1792
1793
1794
1795
1796 if (cur_ng)
1797 imp -= imp / 16;
1798 } else {
1799
1800
1801
1802
1803 if (cur_ng && p_ng)
1804 imp += group_weight(cur, env->src_nid, dist) -
1805 group_weight(cur, env->dst_nid, dist);
1806 else
1807 imp += task_weight(cur, env->src_nid, dist) -
1808 task_weight(cur, env->dst_nid, dist);
1809 }
1810
1811
1812 if (cur->numa_preferred_nid == env->dst_nid)
1813 imp -= imp / 16;
1814
1815
1816
1817
1818
1819
1820
1821 if (cur->numa_preferred_nid == env->src_nid)
1822 imp += imp / 8;
1823
1824 if (maymove && moveimp > imp && moveimp > env->best_imp) {
1825 imp = moveimp;
1826 cur = NULL;
1827 goto assign;
1828 }
1829
1830
1831
1832
1833
1834 if (env->best_task && cur->numa_preferred_nid == env->src_nid &&
1835 env->best_task->numa_preferred_nid != env->src_nid) {
1836 goto assign;
1837 }
1838
1839
1840
1841
1842
1843
1844
1845 if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
1846 goto unlock;
1847
1848
1849
1850
1851 load = task_h_load(env->p) - task_h_load(cur);
1852 if (!load)
1853 goto assign;
1854
1855 dst_load = env->dst_stats.load + load;
1856 src_load = env->src_stats.load - load;
1857
1858 if (load_too_imbalanced(src_load, dst_load, env))
1859 goto unlock;
1860
1861assign:
1862
1863 if (!cur) {
1864 int cpu = env->dst_stats.idle_cpu;
1865
1866
1867 if (cpu < 0)
1868 cpu = env->dst_cpu;
1869
1870
1871
1872
1873
1874 if (!idle_cpu(cpu) && env->best_cpu >= 0 &&
1875 idle_cpu(env->best_cpu)) {
1876 cpu = env->best_cpu;
1877 }
1878
1879 env->dst_cpu = cpu;
1880 }
1881
1882 task_numa_assign(env, cur, imp);
1883
1884
1885
1886
1887
1888
1889 if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu))
1890 stopsearch = true;
1891
1892
1893
1894
1895
1896 if (!maymove && env->best_task &&
1897 env->best_task->numa_preferred_nid == env->src_nid) {
1898 stopsearch = true;
1899 }
1900unlock:
1901 rcu_read_unlock();
1902
1903 return stopsearch;
1904}
1905
1906static void task_numa_find_cpu(struct task_numa_env *env,
1907 long taskimp, long groupimp)
1908{
1909 bool maymove = false;
1910 int cpu;
1911
1912
1913
1914
1915
1916 if (env->dst_stats.node_type == node_has_spare) {
1917 unsigned int imbalance;
1918 int src_running, dst_running;
1919
1920
1921
1922
1923
1924
1925
1926 src_running = env->src_stats.nr_running - 1;
1927 dst_running = env->dst_stats.nr_running + 1;
1928 imbalance = max(0, dst_running - src_running);
1929 imbalance = adjust_numa_imbalance(imbalance, dst_running,
1930 env->dst_stats.weight);
1931
1932
1933 if (!imbalance) {
1934 maymove = true;
1935 if (env->dst_stats.idle_cpu >= 0) {
1936 env->dst_cpu = env->dst_stats.idle_cpu;
1937 task_numa_assign(env, NULL, 0);
1938 return;
1939 }
1940 }
1941 } else {
1942 long src_load, dst_load, load;
1943
1944
1945
1946
1947 load = task_h_load(env->p);
1948 dst_load = env->dst_stats.load + load;
1949 src_load = env->src_stats.load - load;
1950 maymove = !load_too_imbalanced(src_load, dst_load, env);
1951 }
1952
1953 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1954
1955 if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
1956 continue;
1957
1958 env->dst_cpu = cpu;
1959 if (task_numa_compare(env, taskimp, groupimp, maymove))
1960 break;
1961 }
1962}
1963
1964static int task_numa_migrate(struct task_struct *p)
1965{
1966 struct task_numa_env env = {
1967 .p = p,
1968
1969 .src_cpu = task_cpu(p),
1970 .src_nid = task_node(p),
1971
1972 .imbalance_pct = 112,
1973
1974 .best_task = NULL,
1975 .best_imp = 0,
1976 .best_cpu = -1,
1977 };
1978 unsigned long taskweight, groupweight;
1979 struct sched_domain *sd;
1980 long taskimp, groupimp;
1981 struct numa_group *ng;
1982 struct rq *best_rq;
1983 int nid, ret, dist;
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993 rcu_read_lock();
1994 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1995 if (sd)
1996 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1997 rcu_read_unlock();
1998
1999
2000
2001
2002
2003
2004
2005 if (unlikely(!sd)) {
2006 sched_setnuma(p, task_node(p));
2007 return -EINVAL;
2008 }
2009
2010 env.dst_nid = p->numa_preferred_nid;
2011 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
2012 taskweight = task_weight(p, env.src_nid, dist);
2013 groupweight = group_weight(p, env.src_nid, dist);
2014 update_numa_stats(&env, &env.src_stats, env.src_nid, false);
2015 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
2016 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
2017 update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
2018
2019
2020 task_numa_find_cpu(&env, taskimp, groupimp);
2021
2022
2023
2024
2025
2026
2027
2028
2029 ng = deref_curr_numa_group(p);
2030 if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
2031 for_each_online_node(nid) {
2032 if (nid == env.src_nid || nid == p->numa_preferred_nid)
2033 continue;
2034
2035 dist = node_distance(env.src_nid, env.dst_nid);
2036 if (sched_numa_topology_type == NUMA_BACKPLANE &&
2037 dist != env.dist) {
2038 taskweight = task_weight(p, env.src_nid, dist);
2039 groupweight = group_weight(p, env.src_nid, dist);
2040 }
2041
2042
2043 taskimp = task_weight(p, nid, dist) - taskweight;
2044 groupimp = group_weight(p, nid, dist) - groupweight;
2045 if (taskimp < 0 && groupimp < 0)
2046 continue;
2047
2048 env.dist = dist;
2049 env.dst_nid = nid;
2050 update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
2051 task_numa_find_cpu(&env, taskimp, groupimp);
2052 }
2053 }
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063 if (ng) {
2064 if (env.best_cpu == -1)
2065 nid = env.src_nid;
2066 else
2067 nid = cpu_to_node(env.best_cpu);
2068
2069 if (nid != p->numa_preferred_nid)
2070 sched_setnuma(p, nid);
2071 }
2072
2073
2074 if (env.best_cpu == -1) {
2075 trace_sched_stick_numa(p, env.src_cpu, NULL, -1);
2076 return -EAGAIN;
2077 }
2078
2079 best_rq = cpu_rq(env.best_cpu);
2080 if (env.best_task == NULL) {
2081 ret = migrate_task_to(p, env.best_cpu);
2082 WRITE_ONCE(best_rq->numa_migrate_on, 0);
2083 if (ret != 0)
2084 trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu);
2085 return ret;
2086 }
2087
2088 ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
2089 WRITE_ONCE(best_rq->numa_migrate_on, 0);
2090
2091 if (ret != 0)
2092 trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu);
2093 put_task_struct(env.best_task);
2094 return ret;
2095}
2096
2097
2098static void numa_migrate_preferred(struct task_struct *p)
2099{
2100 unsigned long interval = HZ;
2101
2102
2103 if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
2104 return;
2105
2106
2107 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
2108 p->numa_migrate_retry = jiffies + interval;
2109
2110
2111 if (task_node(p) == p->numa_preferred_nid)
2112 return;
2113
2114
2115 task_numa_migrate(p);
2116}
2117
2118
2119
2120
2121
2122
2123
2124static void numa_group_count_active_nodes(struct numa_group *numa_group)
2125{
2126 unsigned long faults, max_faults = 0;
2127 int nid, active_nodes = 0;
2128
2129 for_each_online_node(nid) {
2130 faults = group_faults_cpu(numa_group, nid);
2131 if (faults > max_faults)
2132 max_faults = faults;
2133 }
2134
2135 for_each_online_node(nid) {
2136 faults = group_faults_cpu(numa_group, nid);
2137 if (faults * ACTIVE_NODE_FRACTION > max_faults)
2138 active_nodes++;
2139 }
2140
2141 numa_group->max_faults_cpu = max_faults;
2142 numa_group->active_nodes = active_nodes;
2143}
2144
#define NUMA_PERIOD_SLOTS 10
#define NUMA_PERIOD_THRESHOLD 7
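/*
 * Added commentary: update_task_scan_period() below adjusts the scan period
 * in steps of 1/NUMA_PERIOD_SLOTS (10%) of its current value. When at least
 * NUMA_PERIOD_THRESHOLD out of NUMA_PERIOD_SLOTS (70%) of the recent hinting
 * faults were private, or local, the period is increased (scanning slows
 * down); otherwise it is decreased so scanning becomes more aggressive.
 */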
2154
2155
2156
2157
2158
2159
2160
2161static void update_task_scan_period(struct task_struct *p,
2162 unsigned long shared, unsigned long private)
2163{
2164 unsigned int period_slot;
2165 int lr_ratio, ps_ratio;
2166 int diff;
2167
2168 unsigned long remote = p->numa_faults_locality[0];
2169 unsigned long local = p->numa_faults_locality[1];
2170
2171
2172
2173
2174
2175
2176
2177
2178 if (local + shared == 0 || p->numa_faults_locality[2]) {
2179 p->numa_scan_period = min(p->numa_scan_period_max,
2180 p->numa_scan_period << 1);
2181
2182 p->mm->numa_next_scan = jiffies +
2183 msecs_to_jiffies(p->numa_scan_period);
2184
2185 return;
2186 }
2187
2188
2189
2190
2191
2192
2193
2194 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
2195 lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
2196 ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
2197
2198 if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
2199
2200
2201
2202
2203 int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
2204 if (!slot)
2205 slot = 1;
2206 diff = slot * period_slot;
2207 } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
2208
2209
2210
2211
2212
2213 int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
2214 if (!slot)
2215 slot = 1;
2216 diff = slot * period_slot;
2217 } else {
2218
2219
2220
2221
2222
2223 int ratio = max(lr_ratio, ps_ratio);
2224 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
2225 }
2226
2227 p->numa_scan_period = clamp(p->numa_scan_period + diff,
2228 task_scan_min(p), task_scan_max(p));
2229 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2230}
2231
2232
2233
2234
2235
2236
2237
2238
2239static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
2240{
2241 u64 runtime, delta, now;
2242
2243 now = p->se.exec_start;
2244 runtime = p->se.sum_exec_runtime;
2245
2246 if (p->last_task_numa_placement) {
2247 delta = runtime - p->last_sum_exec_runtime;
2248 *period = now - p->last_task_numa_placement;
2249
2250
2251 if (unlikely((s64)*period < 0))
2252 *period = 0;
2253 } else {
2254 delta = p->se.avg.load_sum;
2255 *period = LOAD_AVG_MAX;
2256 }
2257
2258 p->last_sum_exec_runtime = runtime;
2259 p->last_task_numa_placement = now;
2260
2261 return delta;
2262}
2263
2264
2265
2266
2267
2268
2269static int preferred_group_nid(struct task_struct *p, int nid)
2270{
2271 nodemask_t nodes;
2272 int dist;
2273
2274
2275 if (sched_numa_topology_type == NUMA_DIRECT)
2276 return nid;
2277
2278
2279
2280
2281
2282
2283 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
2284 unsigned long score, max_score = 0;
2285 int node, max_node = nid;
2286
2287 dist = sched_max_numa_distance;
2288
2289 for_each_online_node(node) {
2290 score = group_weight(p, node, dist);
2291 if (score > max_score) {
2292 max_score = score;
2293 max_node = node;
2294 }
2295 }
2296 return max_node;
2297 }
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308 nodes = node_online_map;
2309 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
2310 unsigned long max_faults = 0;
2311 nodemask_t max_group = NODE_MASK_NONE;
2312 int a, b;
2313
2314
2315 if (!find_numa_distance(dist))
2316 continue;
2317
2318 for_each_node_mask(a, nodes) {
2319 unsigned long faults = 0;
2320 nodemask_t this_group;
2321 nodes_clear(this_group);
2322
2323
2324 for_each_node_mask(b, nodes) {
2325 if (node_distance(a, b) < dist) {
2326 faults += group_faults(p, b);
2327 node_set(b, this_group);
2328 node_clear(b, nodes);
2329 }
2330 }
2331
2332
2333 if (faults > max_faults) {
2334 max_faults = faults;
2335 max_group = this_group;
2336
2337
2338
2339
2340
2341 nid = a;
2342 }
2343 }
2344
2345 if (!max_faults)
2346 break;
2347 nodes = max_group;
2348 }
2349 return nid;
2350}
2351
2352static void task_numa_placement(struct task_struct *p)
2353{
2354 int seq, nid, max_nid = NUMA_NO_NODE;
2355 unsigned long max_faults = 0;
2356 unsigned long fault_types[2] = { 0, 0 };
2357 unsigned long total_faults;
2358 u64 runtime, period;
2359 spinlock_t *group_lock = NULL;
2360 struct numa_group *ng;
2361
2362
2363
2364
2365
2366
2367 seq = READ_ONCE(p->mm->numa_scan_seq);
2368 if (p->numa_scan_seq == seq)
2369 return;
2370 p->numa_scan_seq = seq;
2371 p->numa_scan_period_max = task_scan_max(p);
2372
2373 total_faults = p->numa_faults_locality[0] +
2374 p->numa_faults_locality[1];
2375 runtime = numa_get_avg_runtime(p, &period);
2376
2377
2378 ng = deref_curr_numa_group(p);
2379 if (ng) {
2380 group_lock = &ng->lock;
2381 spin_lock_irq(group_lock);
2382 }
2383
2384
2385 for_each_online_node(nid) {
2386
2387 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
2388 unsigned long faults = 0, group_faults = 0;
2389 int priv;
2390
2391 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
2392 long diff, f_diff, f_weight;
2393
2394 mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
2395 membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
2396 cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
2397 cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
2398
2399
2400 diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
2401 fault_types[priv] += p->numa_faults[membuf_idx];
2402 p->numa_faults[membuf_idx] = 0;
2403
2404
2405
2406
2407
2408
2409
2410
2411 f_weight = div64_u64(runtime << 16, period + 1);
2412 f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
2413 (total_faults + 1);
2414 f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2415 p->numa_faults[cpubuf_idx] = 0;
2416
2417 p->numa_faults[mem_idx] += diff;
2418 p->numa_faults[cpu_idx] += f_diff;
2419 faults += p->numa_faults[mem_idx];
2420 p->total_numa_faults += diff;
2421 if (ng) {
2422
2423
2424
2425
2426
2427
2428
2429 ng->faults[mem_idx] += diff;
2430 ng->faults_cpu[mem_idx] += f_diff;
2431 ng->total_faults += diff;
2432 group_faults += ng->faults[mem_idx];
2433 }
2434 }
2435
2436 if (!ng) {
2437 if (faults > max_faults) {
2438 max_faults = faults;
2439 max_nid = nid;
2440 }
2441 } else if (group_faults > max_faults) {
2442 max_faults = group_faults;
2443 max_nid = nid;
2444 }
2445 }
2446
2447 if (ng) {
2448 numa_group_count_active_nodes(ng);
2449 spin_unlock_irq(group_lock);
2450 max_nid = preferred_group_nid(p, max_nid);
2451 }
2452
2453 if (max_faults) {
2454
2455 if (max_nid != p->numa_preferred_nid)
2456 sched_setnuma(p, max_nid);
2457 }
2458
2459 update_task_scan_period(p, fault_types[0], fault_types[1]);
2460}
2461
2462static inline int get_numa_group(struct numa_group *grp)
2463{
2464 return refcount_inc_not_zero(&grp->refcount);
2465}
2466
2467static inline void put_numa_group(struct numa_group *grp)
2468{
2469 if (refcount_dec_and_test(&grp->refcount))
2470 kfree_rcu(grp, rcu);
2471}
2472
2473static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2474 int *priv)
2475{
2476 struct numa_group *grp, *my_grp;
2477 struct task_struct *tsk;
2478 bool join = false;
2479 int cpu = cpupid_to_cpu(cpupid);
2480 int i;
2481
2482 if (unlikely(!deref_curr_numa_group(p))) {
2483 unsigned int size = sizeof(struct numa_group) +
2484 4*nr_node_ids*sizeof(unsigned long);
2485
2486 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2487 if (!grp)
2488 return;
2489
2490 refcount_set(&grp->refcount, 1);
2491 grp->active_nodes = 1;
2492 grp->max_faults_cpu = 0;
2493 spin_lock_init(&grp->lock);
2494 grp->gid = p->pid;
2495
2496 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
2497 nr_node_ids;
2498
2499 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2500 grp->faults[i] = p->numa_faults[i];
2501
2502 grp->total_faults = p->total_numa_faults;
2503
2504 grp->nr_tasks++;
2505 rcu_assign_pointer(p->numa_group, grp);
2506 }
2507
2508 rcu_read_lock();
2509 tsk = READ_ONCE(cpu_rq(cpu)->curr);
2510
2511 if (!cpupid_match_pid(tsk, cpupid))
2512 goto no_join;
2513
2514 grp = rcu_dereference(tsk->numa_group);
2515 if (!grp)
2516 goto no_join;
2517
2518 my_grp = deref_curr_numa_group(p);
2519 if (grp == my_grp)
2520 goto no_join;
2521
2522
2523
2524
2525
2526 if (my_grp->nr_tasks > grp->nr_tasks)
2527 goto no_join;
2528
2529
2530
2531
2532 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
2533 goto no_join;
2534
2535
2536 if (tsk->mm == current->mm)
2537 join = true;
2538
2539
2540 if (flags & TNF_SHARED)
2541 join = true;
2542
2543
2544 *priv = !join;
2545
2546 if (join && !get_numa_group(grp))
2547 goto no_join;
2548
2549 rcu_read_unlock();
2550
2551 if (!join)
2552 return;
2553
2554 BUG_ON(irqs_disabled());
2555 double_lock_irq(&my_grp->lock, &grp->lock);
2556
2557 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
2558 my_grp->faults[i] -= p->numa_faults[i];
2559 grp->faults[i] += p->numa_faults[i];
2560 }
2561 my_grp->total_faults -= p->total_numa_faults;
2562 grp->total_faults += p->total_numa_faults;
2563
2564 my_grp->nr_tasks--;
2565 grp->nr_tasks++;
2566
2567 spin_unlock(&my_grp->lock);
2568 spin_unlock_irq(&grp->lock);
2569
2570 rcu_assign_pointer(p->numa_group, grp);
2571
2572 put_numa_group(my_grp);
2573 return;
2574
2575no_join:
2576 rcu_read_unlock();
2577 return;
2578}
2579
2580
2581
2582
2583
2584
2585
2586
2587void task_numa_free(struct task_struct *p, bool final)
2588{
2589
2590 struct numa_group *grp = rcu_dereference_raw(p->numa_group);
2591 unsigned long *numa_faults = p->numa_faults;
2592 unsigned long flags;
2593 int i;
2594
2595 if (!numa_faults)
2596 return;
2597
2598 if (grp) {
2599 spin_lock_irqsave(&grp->lock, flags);
2600 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2601 grp->faults[i] -= p->numa_faults[i];
2602 grp->total_faults -= p->total_numa_faults;
2603
2604 grp->nr_tasks--;
2605 spin_unlock_irqrestore(&grp->lock, flags);
2606 RCU_INIT_POINTER(p->numa_group, NULL);
2607 put_numa_group(grp);
2608 }
2609
2610 if (final) {
2611 p->numa_faults = NULL;
2612 kfree(numa_faults);
2613 } else {
2614 p->total_numa_faults = 0;
2615 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2616 numa_faults[i] = 0;
2617 }
2618}
2619
2620
2621
2622
2623void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2624{
2625 struct task_struct *p = current;
2626 bool migrated = flags & TNF_MIGRATED;
2627 int cpu_node = task_node(current);
2628 int local = !!(flags & TNF_FAULT_LOCAL);
2629 struct numa_group *ng;
2630 int priv;
2631
2632 if (!static_branch_likely(&sched_numa_balancing))
2633 return;
2634
2635
2636 if (!p->mm)
2637 return;
2638
2639
2640 if (unlikely(!p->numa_faults)) {
2641 int size = sizeof(*p->numa_faults) *
2642 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
2643
2644 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2645 if (!p->numa_faults)
2646 return;
2647
2648 p->total_numa_faults = 0;
2649 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2650 }
2651
2652
2653
2654
2655
2656 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2657 priv = 1;
2658 } else {
2659 priv = cpupid_match_pid(p, last_cpupid);
2660 if (!priv && !(flags & TNF_NO_GROUP))
2661 task_numa_group(p, last_cpupid, flags, &priv);
2662 }
2663
2664
2665
2666
2667
2668
2669
2670 ng = deref_curr_numa_group(p);
2671 if (!priv && !local && ng && ng->active_nodes > 1 &&
2672 numa_is_active_node(cpu_node, ng) &&
2673 numa_is_active_node(mem_node, ng))
2674 local = 1;
2675
2676
2677
2678
2679
2680 if (time_after(jiffies, p->numa_migrate_retry)) {
2681 task_numa_placement(p);
2682 numa_migrate_preferred(p);
2683 }
2684
2685 if (migrated)
2686 p->numa_pages_migrated += pages;
2687 if (flags & TNF_MIGRATE_FAIL)
2688 p->numa_faults_locality[2] += pages;
2689
2690 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2691 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
2692 p->numa_faults_locality[local] += pages;
2693}
2694
2695static void reset_ptenuma_scan(struct task_struct *p)
2696{
2697
2698
2699
2700
2701
2702
2703
2704
2705 WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2706 p->mm->numa_scan_offset = 0;
2707}
2708
2709
2710
2711
2712
2713static void task_numa_work(struct callback_head *work)
2714{
2715 unsigned long migrate, next_scan, now = jiffies;
2716 struct task_struct *p = current;
2717 struct mm_struct *mm = p->mm;
2718 u64 runtime = p->se.sum_exec_runtime;
2719 struct vm_area_struct *vma;
2720 unsigned long start, end;
2721 unsigned long nr_pte_updates = 0;
2722 long pages, virtpages;
2723
2724 SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
2725
2726 work->next = work;
2727
2728
2729
2730
2731
2732
2733
2734
2735 if (p->flags & PF_EXITING)
2736 return;
2737
2738 if (!mm->numa_next_scan) {
2739 mm->numa_next_scan = now +
2740 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2741 }
2742
2743
2744
2745
2746 migrate = mm->numa_next_scan;
2747 if (time_before(now, migrate))
2748 return;
2749
2750 if (p->numa_scan_period == 0) {
2751 p->numa_scan_period_max = task_scan_max(p);
2752 p->numa_scan_period = task_scan_start(p);
2753 }
2754
2755 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
2756 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2757 return;
2758
2759
2760
2761
2762
2763 p->node_stamp += 2 * TICK_NSEC;
2764
2765 start = mm->numa_scan_offset;
2766 pages = sysctl_numa_balancing_scan_size;
2767 pages <<= 20 - PAGE_SHIFT;
2768 virtpages = pages * 8;
2769 if (!pages)
2770 return;
2771
2772
2773 if (!mmap_read_trylock(mm))
2774 return;
2775 vma = find_vma(mm, start);
2776 if (!vma) {
2777 reset_ptenuma_scan(p);
2778 start = 0;
2779 vma = mm->mmap;
2780 }
2781 for (; vma; vma = vma->vm_next) {
2782 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
2783 is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
2784 continue;
2785 }
2786
2787
2788
2789
2790
2791
2792
2793 if (!vma->vm_mm ||
2794 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2795 continue;
2796
2797
2798
2799
2800
2801 if (!vma_is_accessible(vma))
2802 continue;
2803
2804 do {
2805 start = max(start, vma->vm_start);
2806 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2807 end = min(end, vma->vm_end);
2808 nr_pte_updates = change_prot_numa(vma, start, end);
2809
2810
2811
2812
2813
2814
2815
2816
2817
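/*
 * Only charge the scan budget ('pages') when PTEs were actually
 * updated; always charge 'virtpages' so regions that are empty or
 * already marked are skipped over quickly.
 */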
2818 if (nr_pte_updates)
2819 pages -= (end - start) >> PAGE_SHIFT;
2820 virtpages -= (end - start) >> PAGE_SHIFT;
2821
2822 start = end;
2823 if (pages <= 0 || virtpages <= 0)
2824 goto out;
2825
2826 cond_resched();
2827 } while (end != vma->vm_end);
2828 }
2829
2830out:
2831
2832
2833
2834
2835
2836
2837 if (vma)
2838 mm->numa_scan_offset = start;
2839 else
2840 reset_ptenuma_scan(p);
2841 mmap_read_unlock(mm);
2842
2843
2844
2845
2846
2847
2848
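/*
 * Charge the CPU time this scan consumed against the next scan window
 * (scaled by 32), so that PTE scanning overhead stays a small fraction
 * of the task's runtime even on an overloaded system.
 */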
2849 if (unlikely(p->se.sum_exec_runtime != runtime)) {
2850 u64 diff = p->se.sum_exec_runtime - runtime;
2851 p->node_stamp += 32 * diff;
2852 }
2853}
2854
2855void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
2856{
2857 int mm_users = 0;
2858 struct mm_struct *mm = p->mm;
2859
2860 if (mm) {
2861 mm_users = atomic_read(&mm->mm_users);
2862 if (mm_users == 1) {
2863 mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2864 mm->numa_scan_seq = 0;
2865 }
2866 }
2867 p->node_stamp = 0;
2868 p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
2869 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
2870
2871 p->numa_work.next = &p->numa_work;
2872 p->numa_faults = NULL;
2873 RCU_INIT_POINTER(p->numa_group, NULL);
2874 p->last_task_numa_placement = 0;
2875 p->last_sum_exec_runtime = 0;
2876
2877 init_task_work(&p->numa_work, task_numa_work);
2878
2879
2880 if (!(clone_flags & CLONE_VM)) {
2881 p->numa_preferred_nid = NUMA_NO_NODE;
2882 return;
2883 }
2884
2885
2886
2887
2888
2889 if (mm) {
2890 unsigned int delay;
2891
2892 delay = min_t(unsigned int, task_scan_max(current),
2893 current->numa_scan_period * mm_users * NSEC_PER_MSEC);
2894 delay += 2 * TICK_NSEC;
2895 p->node_stamp = delay;
2896 }
2897}
2898
2899
2900
2901
2902static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2903{
2904 struct callback_head *work = &curr->numa_work;
2905 u64 period, now;
2906
2907
2908
2909
2910 if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
2911 return;
2912
2913
2914
2915
2916
2917
2918
2919 now = curr->se.sum_exec_runtime;
2920 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2921
2922 if (now > curr->node_stamp + period) {
2923 if (!curr->node_stamp)
2924 curr->numa_scan_period = task_scan_start(curr);
2925 curr->node_stamp += period;
2926
2927 if (!time_before(jiffies, curr->mm->numa_next_scan))
2928 task_work_add(curr, work, TWA_RESUME);
2929 }
2930}
2931
2932static void update_scan_period(struct task_struct *p, int new_cpu)
2933{
2934 int src_nid = cpu_to_node(task_cpu(p));
2935 int dst_nid = cpu_to_node(new_cpu);
2936
2937 if (!static_branch_likely(&sched_numa_balancing))
2938 return;
2939
2940 if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
2941 return;
2942
2943 if (src_nid == dst_nid)
2944 return;
2945
2946
2947
2948
2949
2950
2951 if (p->numa_scan_seq) {
2952
2953
2954
2955
2956
2957 if (dst_nid == p->numa_preferred_nid ||
2958 (p->numa_preferred_nid != NUMA_NO_NODE &&
2959 src_nid != p->numa_preferred_nid))
2960 return;
2961 }
2962
2963 p->numa_scan_period = task_scan_start(p);
2964}
2965
2966#else
2967static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2968{
2969}
2970
2971static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2972{
2973}
2974
2975static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2976{
2977}
2978
2979static inline void update_scan_period(struct task_struct *p, int new_cpu)
2980{
2981}
2982
2983#endif
2984
2985static void
2986account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2987{
2988 update_load_add(&cfs_rq->load, se->load.weight);
2989#ifdef CONFIG_SMP
2990 if (entity_is_task(se)) {
2991 struct rq *rq = rq_of(cfs_rq);
2992
2993 account_numa_enqueue(rq, task_of(se));
2994 list_add(&se->group_node, &rq->cfs_tasks);
2995 }
2996#endif
2997 cfs_rq->nr_running++;
2998}
2999
3000static void
3001account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
3002{
3003 update_load_sub(&cfs_rq->load, se->load.weight);
3004#ifdef CONFIG_SMP
3005 if (entity_is_task(se)) {
3006 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
3007 list_del_init(&se->group_node);
3008 }
3009#endif
3010 cfs_rq->nr_running--;
3011}
3012
3013
3014
3015
3016
3017
3018
3019
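/*
 * add_positive(): add a signed increment to an unsigned value; if a
 * negative increment would wrap the result below zero, clamp to 0.
 */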
3020#define add_positive(_ptr, _val) do { \
3021 typeof(_ptr) ptr = (_ptr); \
3022 typeof(_val) val = (_val); \
3023 typeof(*ptr) res, var = READ_ONCE(*ptr); \
3024 \
3025 res = var + val; \
3026 \
3027 if (val < 0 && res > var) \
3028 res = 0; \
3029 \
3030 WRITE_ONCE(*ptr, res); \
3031} while (0)
3032
3033
3034
3035
3036
3037
3038
3039
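/*
 * sub_positive(): unsigned subtraction that clamps at 0 instead of
 * wrapping when the amount removed exceeds the current value.
 */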
3040#define sub_positive(_ptr, _val) do { \
3041 typeof(_ptr) ptr = (_ptr); \
3042 typeof(*ptr) val = (_val); \
3043 typeof(*ptr) res, var = READ_ONCE(*ptr); \
3044 res = var - val; \
3045 if (res > var) \
3046 res = 0; \
3047 WRITE_ONCE(*ptr, res); \
3048} while (0)
3049
3050
3051
3052
3053
3054
3055
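/*
 * lsub_positive(): like sub_positive(), but for local variables only,
 * so the READ_ONCE/WRITE_ONCE pair is not needed.
 */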
3056#define lsub_positive(_ptr, _val) do { \
3057 typeof(_ptr) ptr = (_ptr); \
3058 *ptr -= min_t(typeof(*ptr), *ptr, _val); \
3059} while (0)
3060
3061#ifdef CONFIG_SMP
3062static inline void
3063enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3064{
3065 cfs_rq->avg.load_avg += se->avg.load_avg;
3066 cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
3067}
3068
3069static inline void
3070dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3071{
3072 u32 divider = get_pelt_divider(&se->avg);
3073 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
3074 cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
3075}
3076#else
3077static inline void
3078enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
3079static inline void
3080dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
3081#endif
3082
3083static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
3084 unsigned long weight)
3085{
3086 if (se->on_rq) {
3087
3088 if (cfs_rq->curr == se)
3089 update_curr(cfs_rq);
3090 update_load_sub(&cfs_rq->load, se->load.weight);
3091 }
3092 dequeue_load_avg(cfs_rq, se);
3093
3094 update_load_set(&se->load, weight);
3095
3096#ifdef CONFIG_SMP
3097 do {
3098 u32 divider = get_pelt_divider(&se->avg);
3099
3100 se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
3101 } while (0);
3102#endif
3103
3104 enqueue_load_avg(cfs_rq, se);
3105 if (se->on_rq)
3106 update_load_add(&cfs_rq->load, se->load.weight);
3107
3108}
3109
3110void reweight_task(struct task_struct *p, int prio)
3111{
3112 struct sched_entity *se = &p->se;
3113 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3114 struct load_weight *load = &se->load;
3115 unsigned long weight = scale_load(sched_prio_to_weight[prio]);
3116
3117 reweight_entity(cfs_rq, se, weight);
3118 load->inv_weight = sched_prio_to_wmult[prio];
3119}
3120
3121#ifdef CONFIG_FAIR_GROUP_SCHED
3122#ifdef CONFIG_SMP
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
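/*
 * calc_group_shares() approximates this group entity's weight on this CPU:
 *
 *   shares ~= tg->shares * load /
 *             (tg->load_avg - grq->tg_load_avg_contrib + load)
 *
 * where load = max(scale_load_down(grq->load.weight), grq->avg.load_avg),
 * so a group that just gained runnable tasks is boosted before its load
 * average catches up.  The result is clamped to [MIN_SHARES, tg->shares].
 */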
3196static long calc_group_shares(struct cfs_rq *cfs_rq)
3197{
3198 long tg_weight, tg_shares, load, shares;
3199 struct task_group *tg = cfs_rq->tg;
3200
3201 tg_shares = READ_ONCE(tg->shares);
3202
3203 load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
3204
3205 tg_weight = atomic_long_read(&tg->load_avg);
3206
3207
3208 tg_weight -= cfs_rq->tg_load_avg_contrib;
3209 tg_weight += load;
3210
3211 shares = (tg_shares * load);
3212 if (tg_weight)
3213 shares /= tg_weight;
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227 return clamp_t(long, shares, MIN_SHARES, tg_shares);
3228}
3229#endif
3230
3231static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
3232
3233
3234
3235
3236
3237static void update_cfs_group(struct sched_entity *se)
3238{
3239 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3240 long shares;
3241
3242 if (!gcfs_rq)
3243 return;
3244
3245 if (throttled_hierarchy(gcfs_rq))
3246 return;
3247
3248#ifndef CONFIG_SMP
3249 shares = READ_ONCE(gcfs_rq->tg->shares);
3250
3251 if (likely(se->load.weight == shares))
3252 return;
3253#else
3254 shares = calc_group_shares(gcfs_rq);
3255#endif
3256
3257 reweight_entity(cfs_rq_of(se), se, shares);
3258}
3259
3260#else
3261static inline void update_cfs_group(struct sched_entity *se)
3262{
3263}
3264#endif
3265
3266static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
3267{
3268 struct rq *rq = rq_of(cfs_rq);
3269
3270 if (&rq->cfs == cfs_rq) {
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285 cpufreq_update_util(rq, flags);
3286 }
3287}
3288
3289#ifdef CONFIG_SMP
3290#ifdef CONFIG_FAIR_GROUP_SCHED
3291
3292
3293
3294
3295
3296
3297
3298
3299static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq)
3300{
3301 struct cfs_rq *prev_cfs_rq;
3302 struct list_head *prev;
3303
3304 if (cfs_rq->on_list) {
3305 prev = cfs_rq->leaf_cfs_rq_list.prev;
3306 } else {
3307 struct rq *rq = rq_of(cfs_rq);
3308
3309 prev = rq->tmp_alone_branch;
3310 }
3311
3312 prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list);
3313
3314 return (prev_cfs_rq->tg->parent == cfs_rq->tg);
3315}
3316
3317static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
3318{
3319 if (cfs_rq->load.weight)
3320 return false;
3321
3322 if (cfs_rq->avg.load_sum)
3323 return false;
3324
3325 if (cfs_rq->avg.util_sum)
3326 return false;
3327
3328 if (cfs_rq->avg.runnable_sum)
3329 return false;
3330
3331 if (child_cfs_rq_on_list(cfs_rq))
3332 return false;
3333
3334
3335
3336
3337
3338
3339 SCHED_WARN_ON(cfs_rq->avg.load_avg ||
3340 cfs_rq->avg.util_avg ||
3341 cfs_rq->avg.runnable_avg);
3342
3343 return true;
3344}
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
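/*
 * update_tg_load_avg() folds this cfs_rq's load_avg delta into the
 * global tg->load_avg, but only when the change exceeds roughly 1/64th
 * of the last recorded contribution, to limit cross-CPU atomic traffic.
 */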
3360static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
3361{
3362 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
3363
3364
3365
3366
3367 if (cfs_rq->tg == &root_task_group)
3368 return;
3369
3370 if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
3371 atomic_long_add(delta, &cfs_rq->tg->load_avg);
3372 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
3373 }
3374}
3375
3376
3377
3378
3379
3380
3381void set_task_rq_fair(struct sched_entity *se,
3382 struct cfs_rq *prev, struct cfs_rq *next)
3383{
3384 u64 p_last_update_time;
3385 u64 n_last_update_time;
3386
3387 if (!sched_feat(ATTACH_AGE_LOAD))
3388 return;
3389
3390
3391
3392
3393
3394
3395
3396
3397 if (!(se->avg.last_update_time && prev))
3398 return;
3399
3400#ifndef CONFIG_64BIT
3401 {
3402 u64 p_last_update_time_copy;
3403 u64 n_last_update_time_copy;
3404
3405 do {
3406 p_last_update_time_copy = prev->load_last_update_time_copy;
3407 n_last_update_time_copy = next->load_last_update_time_copy;
3408
3409 smp_rmb();
3410
3411 p_last_update_time = prev->avg.last_update_time;
3412 n_last_update_time = next->avg.last_update_time;
3413
3414 } while (p_last_update_time != p_last_update_time_copy ||
3415 n_last_update_time != n_last_update_time_copy);
3416 }
3417#else
3418 p_last_update_time = prev->avg.last_update_time;
3419 n_last_update_time = next->avg.last_update_time;
3420#endif
3421 __update_load_avg_blocked_se(p_last_update_time, se);
3422 se->avg.last_update_time = n_last_update_time;
3423}
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
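/*
 * The update_tg_cfs_*() helpers below propagate a child group cfs_rq's
 * util, runnable and load changes up to its owning group sched_entity
 * and the parent cfs_rq, keeping each *_avg/*_sum pair consistent with
 * the parent's PELT divider.
 */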
3494static inline void
3495update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3496{
3497 long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
3498 u32 divider;
3499
3500
3501 if (!delta)
3502 return;
3503
3504
3505
3506
3507
3508 divider = get_pelt_divider(&cfs_rq->avg);
3509
3510
3511 se->avg.util_avg = gcfs_rq->avg.util_avg;
3512 se->avg.util_sum = se->avg.util_avg * divider;
3513
3514
3515 add_positive(&cfs_rq->avg.util_avg, delta);
3516 cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
3517}
3518
3519static inline void
3520update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3521{
3522 long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
3523 u32 divider;
3524
3525
3526 if (!delta)
3527 return;
3528
3529
3530
3531
3532
3533 divider = get_pelt_divider(&cfs_rq->avg);
3534
3535
3536 se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
3537 se->avg.runnable_sum = se->avg.runnable_avg * divider;
3538
3539
3540 add_positive(&cfs_rq->avg.runnable_avg, delta);
3541 cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
3542}
3543
3544static inline void
3545update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3546{
3547 long delta, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
3548 unsigned long load_avg;
3549 u64 load_sum = 0;
3550 u32 divider;
3551
3552 if (!runnable_sum)
3553 return;
3554
3555 gcfs_rq->prop_runnable_sum = 0;
3556
3557
3558
3559
3560
3561 divider = get_pelt_divider(&cfs_rq->avg);
3562
3563 if (runnable_sum >= 0) {
3564
3565
3566
3567
3568 runnable_sum += se->avg.load_sum;
3569 runnable_sum = min_t(long, runnable_sum, divider);
3570 } else {
3571
3572
3573
3574
3575 if (scale_load_down(gcfs_rq->load.weight)) {
3576 load_sum = div_s64(gcfs_rq->avg.load_sum,
3577 scale_load_down(gcfs_rq->load.weight));
3578 }
3579
3580
3581 runnable_sum = min(se->avg.load_sum, load_sum);
3582 }
3583
3584
3585
3586
3587
3588
3589
3590 running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
3591 runnable_sum = max(runnable_sum, running_sum);
3592
3593 load_sum = (s64)se_weight(se) * runnable_sum;
3594 load_avg = div_s64(load_sum, divider);
3595
3596 se->avg.load_sum = runnable_sum;
3597
3598 delta = load_avg - se->avg.load_avg;
3599 if (!delta)
3600 return;
3601
3602 se->avg.load_avg = load_avg;
3603
3604 add_positive(&cfs_rq->avg.load_avg, delta);
3605 cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
3606}
3607
3608static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
3609{
3610 cfs_rq->propagate = 1;
3611 cfs_rq->prop_runnable_sum += runnable_sum;
3612}
3613
3614
3615static inline int propagate_entity_load_avg(struct sched_entity *se)
3616{
3617 struct cfs_rq *cfs_rq, *gcfs_rq;
3618
3619 if (entity_is_task(se))
3620 return 0;
3621
3622 gcfs_rq = group_cfs_rq(se);
3623 if (!gcfs_rq->propagate)
3624 return 0;
3625
3626 gcfs_rq->propagate = 0;
3627
3628 cfs_rq = cfs_rq_of(se);
3629
3630 add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
3631
3632 update_tg_cfs_util(cfs_rq, se, gcfs_rq);
3633 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
3634 update_tg_cfs_load(cfs_rq, se, gcfs_rq);
3635
3636 trace_pelt_cfs_tp(cfs_rq);
3637 trace_pelt_se_tp(se);
3638
3639 return 1;
3640}
3641
3642
3643
3644
3645
3646static inline bool skip_blocked_update(struct sched_entity *se)
3647{
3648 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3649
3650
3651
3652
3653
3654 if (se->avg.load_avg || se->avg.util_avg)
3655 return false;
3656
3657
3658
3659
3660
3661 if (gcfs_rq->propagate)
3662 return false;
3663
3664
3665
3666
3667
3668
3669 return true;
3670}
3671
3672#else
3673
3674static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
3675
3676static inline int propagate_entity_load_avg(struct sched_entity *se)
3677{
3678 return 0;
3679}
3680
3681static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
3682
3683#endif
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
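/*
 * update_cfs_rq_load_avg - update the cfs_rq's load/util/runnable averages
 * @now: current time, as per cfs_rq_clock_pelt()
 * @cfs_rq: cfs_rq to update
 *
 * Subtract the contributions removed by migrated or dead tasks, then
 * decay the averages.  Returns true if the averages changed, in which
 * case the caller may need to update tg load and/or cpufreq.
 */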
3701static inline int
3702update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
3703{
3704 unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
3705 struct sched_avg *sa = &cfs_rq->avg;
3706 int decayed = 0;
3707
3708 if (cfs_rq->removed.nr) {
3709 unsigned long r;
3710 u32 divider = get_pelt_divider(&cfs_rq->avg);
3711
3712 raw_spin_lock(&cfs_rq->removed.lock);
3713 swap(cfs_rq->removed.util_avg, removed_util);
3714 swap(cfs_rq->removed.load_avg, removed_load);
3715 swap(cfs_rq->removed.runnable_avg, removed_runnable);
3716 cfs_rq->removed.nr = 0;
3717 raw_spin_unlock(&cfs_rq->removed.lock);
3718
3719 r = removed_load;
3720 sub_positive(&sa->load_avg, r);
3721 sa->load_sum = sa->load_avg * divider;
3722
3723 r = removed_util;
3724 sub_positive(&sa->util_avg, r);
3725 sa->util_sum = sa->util_avg * divider;
3726
3727 r = removed_runnable;
3728 sub_positive(&sa->runnable_avg, r);
3729 sa->runnable_sum = sa->runnable_avg * divider;
3730
3731
3732
3733
3734
3735 add_tg_cfs_propagate(cfs_rq,
3736 -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT);
3737
3738 decayed = 1;
3739 }
3740
3741 decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
3742
3743#ifndef CONFIG_64BIT
3744 smp_wmb();
3745 cfs_rq->load_last_update_time_copy = sa->last_update_time;
3746#endif
3747
3748 return decayed;
3749}
3750
3751
3752
3753
3754
3755
3756
3757
3758
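/*
 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
 *
 * Aligns the entity's PELT window with the cfs_rq's (last_update_time
 * and period_contrib are copied) and recomputes the *_sum terms from
 * the *_avg values with the current divider before adding them in.
 */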
3759static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3760{
3761
3762
3763
3764
3765 u32 divider = get_pelt_divider(&cfs_rq->avg);
3766
3767
3768
3769
3770
3771
3772
3773
3774 se->avg.last_update_time = cfs_rq->avg.last_update_time;
3775 se->avg.period_contrib = cfs_rq->avg.period_contrib;
3776
3777
3778
3779
3780
3781
3782
3783 se->avg.util_sum = se->avg.util_avg * divider;
3784
3785 se->avg.runnable_sum = se->avg.runnable_avg * divider;
3786
3787 se->avg.load_sum = divider;
3788 if (se_weight(se)) {
3789 se->avg.load_sum =
3790 div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
3791 }
3792
3793 enqueue_load_avg(cfs_rq, se);
3794 cfs_rq->avg.util_avg += se->avg.util_avg;
3795 cfs_rq->avg.util_sum += se->avg.util_sum;
3796 cfs_rq->avg.runnable_avg += se->avg.runnable_avg;
3797 cfs_rq->avg.runnable_sum += se->avg.runnable_sum;
3798
3799 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
3800
3801 cfs_rq_util_change(cfs_rq, 0);
3802
3803 trace_pelt_cfs_tp(cfs_rq);
3804}
3805
3806
3807
3808
3809
3810
3811
3812
3813
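/*
 * detach_entity_load_avg - detach this entity from its cfs_rq load avg
 *
 * The reverse of attach_entity_load_avg(): subtract the entity's
 * contribution and queue a propagation of the removed load.
 */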
3814static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3815{
3816
3817
3818
3819
3820 u32 divider = get_pelt_divider(&cfs_rq->avg);
3821
3822 dequeue_load_avg(cfs_rq, se);
3823 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
3824 cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
3825 sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
3826 cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
3827
3828 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
3829
3830 cfs_rq_util_change(cfs_rq, 0);
3831
3832 trace_pelt_cfs_tp(cfs_rq);
3833}
3834
3835
3836
3837
3838#define UPDATE_TG 0x1
3839#define SKIP_AGE_LOAD 0x2
3840#define DO_ATTACH 0x4
3841
3842
3843static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3844{
3845 u64 now = cfs_rq_clock_pelt(cfs_rq);
3846 int decayed;
3847
3848
3849
3850
3851
3852 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
3853 __update_load_avg_se(now, cfs_rq, se);
3854
3855 decayed = update_cfs_rq_load_avg(now, cfs_rq);
3856 decayed |= propagate_entity_load_avg(se);
3857
3858 if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
3859
3860
3861
3862
3863
3864
3865
3866
3867 attach_entity_load_avg(cfs_rq, se);
3868 update_tg_load_avg(cfs_rq);
3869
3870 } else if (decayed) {
3871 cfs_rq_util_change(cfs_rq, 0);
3872
3873 if (flags & UPDATE_TG)
3874 update_tg_load_avg(cfs_rq);
3875 }
3876}
3877
3878#ifndef CONFIG_64BIT
3879static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3880{
3881 u64 last_update_time_copy;
3882 u64 last_update_time;
3883
3884 do {
3885 last_update_time_copy = cfs_rq->load_last_update_time_copy;
3886 smp_rmb();
3887 last_update_time = cfs_rq->avg.last_update_time;
3888 } while (last_update_time != last_update_time_copy);
3889
3890 return last_update_time;
3891}
3892#else
3893static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3894{
3895 return cfs_rq->avg.last_update_time;
3896}
3897#endif
3898
3899
3900
3901
3902
3903static void sync_entity_load_avg(struct sched_entity *se)
3904{
3905 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3906 u64 last_update_time;
3907
3908 last_update_time = cfs_rq_last_update_time(cfs_rq);
3909 __update_load_avg_blocked_se(last_update_time, se);
3910}
3911
3912
3913
3914
3915
3916static void remove_entity_load_avg(struct sched_entity *se)
3917{
3918 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3919 unsigned long flags;
3920
3921
3922
3923
3924
3925
3926
3927 sync_entity_load_avg(se);
3928
3929 raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
3930 ++cfs_rq->removed.nr;
3931 cfs_rq->removed.util_avg += se->avg.util_avg;
3932 cfs_rq->removed.load_avg += se->avg.load_avg;
3933 cfs_rq->removed.runnable_avg += se->avg.runnable_avg;
3934 raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
3935}
3936
3937static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq)
3938{
3939 return cfs_rq->avg.runnable_avg;
3940}
3941
3942static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
3943{
3944 return cfs_rq->avg.load_avg;
3945}
3946
3947static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
3948
3949static inline unsigned long task_util(struct task_struct *p)
3950{
3951 return READ_ONCE(p->se.avg.util_avg);
3952}
3953
3954static inline unsigned long _task_util_est(struct task_struct *p)
3955{
3956 struct util_est ue = READ_ONCE(p->se.avg.util_est);
3957
3958 return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
3959}
3960
3961static inline unsigned long task_util_est(struct task_struct *p)
3962{
3963 return max(task_util(p), _task_util_est(p));
3964}
3965
3966#ifdef CONFIG_UCLAMP_TASK
3967static inline unsigned long uclamp_task_util(struct task_struct *p)
3968{
3969 return clamp(task_util_est(p),
3970 uclamp_eff_value(p, UCLAMP_MIN),
3971 uclamp_eff_value(p, UCLAMP_MAX));
3972}
3973#else
3974static inline unsigned long uclamp_task_util(struct task_struct *p)
3975{
3976 return task_util_est(p);
3977}
3978#endif
3979
3980static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
3981 struct task_struct *p)
3982{
3983 unsigned int enqueued;
3984
3985 if (!sched_feat(UTIL_EST))
3986 return;
3987
3988
3989 enqueued = cfs_rq->avg.util_est.enqueued;
3990 enqueued += _task_util_est(p);
3991 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
3992
3993 trace_sched_util_est_cfs_tp(cfs_rq);
3994}
3995
3996static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
3997 struct task_struct *p)
3998{
3999 unsigned int enqueued;
4000
4001 if (!sched_feat(UTIL_EST))
4002 return;
4003
4004
4005 enqueued = cfs_rq->avg.util_est.enqueued;
4006 enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
4007 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
4008
4009 trace_sched_util_est_cfs_tp(cfs_rq);
4010}
4011
4012#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
4013
4014
4015
4016
4017
4018
4019
4020
4021
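/*
 * within_margin() - true iff |value| < margin, evaluated with a single
 * unsigned comparison (values at or beyond +/-margin wrap above the
 * bound and fail the test).
 */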
4022static inline bool within_margin(int value, int margin)
4023{
4024 return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
4025}
4026
4027static inline void util_est_update(struct cfs_rq *cfs_rq,
4028 struct task_struct *p,
4029 bool task_sleep)
4030{
4031 long last_ewma_diff, last_enqueued_diff;
4032 struct util_est ue;
4033
4034 if (!sched_feat(UTIL_EST))
4035 return;
4036
4037
4038
4039
4040
4041 if (!task_sleep)
4042 return;
4043
4044
4045
4046
4047
4048 ue = p->se.avg.util_est;
4049 if (ue.enqueued & UTIL_AVG_UNCHANGED)
4050 return;
4051
4052 last_enqueued_diff = ue.enqueued;
4053
4054
4055
4056
4057
4058 ue.enqueued = task_util(p);
4059 if (sched_feat(UTIL_EST_FASTUP)) {
4060 if (ue.ewma < ue.enqueued) {
4061 ue.ewma = ue.enqueued;
4062 goto done;
4063 }
4064 }
4065
4066
4067
4068
4069
4070 last_ewma_diff = ue.enqueued - ue.ewma;
4071 last_enqueued_diff -= ue.enqueued;
4072 if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) {
4073 if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN))
4074 goto done;
4075
4076 return;
4077 }
4078
4079
4080
4081
4082
4083 if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
4084 return;
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103 ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
4104 ue.ewma += last_ewma_diff;
4105 ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
4106done:
4107 ue.enqueued |= UTIL_AVG_UNCHANGED;
4108 WRITE_ONCE(p->se.avg.util_est, ue);
4109
4110 trace_sched_util_est_se_tp(&p->se);
4111}
4112
4113static inline int task_fits_capacity(struct task_struct *p, long capacity)
4114{
4115 return fits_capacity(uclamp_task_util(p), capacity);
4116}
4117
4118static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
4119{
4120 if (!static_branch_unlikely(&sched_asym_cpucapacity))
4121 return;
4122
4123 if (!p || p->nr_cpus_allowed == 1) {
4124 rq->misfit_task_load = 0;
4125 return;
4126 }
4127
4128 if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
4129 rq->misfit_task_load = 0;
4130 return;
4131 }
4132
4133
4134
4135
4136
4137 rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
4138}
4139
4140#else
4141
4142static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
4143{
4144 return true;
4145}
4146
4147#define UPDATE_TG 0x0
4148#define SKIP_AGE_LOAD 0x0
4149#define DO_ATTACH 0x0
4150
4151static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
4152{
4153 cfs_rq_util_change(cfs_rq, 0);
4154}
4155
4156static inline void remove_entity_load_avg(struct sched_entity *se) {}
4157
4158static inline void
4159attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
4160static inline void
4161detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
4162
4163static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
4164{
4165 return 0;
4166}
4167
4168static inline void
4169util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
4170
4171static inline void
4172util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
4173
4174static inline void
4175util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
4176 bool task_sleep) {}
4177static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
4178
4179#endif
4180
4181static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
4182{
4183#ifdef CONFIG_SCHED_DEBUG
4184 s64 d = se->vruntime - cfs_rq->min_vruntime;
4185
4186 if (d < 0)
4187 d = -d;
4188
4189 if (d > 3*sysctl_sched_latency)
4190 schedstat_inc(cfs_rq->nr_spread_over);
4191#endif
4192}
4193
4194static void
4195place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
4196{
4197 u64 vruntime = cfs_rq->min_vruntime;
4198
4199
4200
4201
4202
4203
4204
4205 if (initial && sched_feat(START_DEBIT))
4206 vruntime += sched_vslice(cfs_rq, se);
4207
4208
4209 if (!initial) {
4210 unsigned long thresh = sysctl_sched_latency;
4211
4212
4213
4214
4215
4216 if (sched_feat(GENTLE_FAIR_SLEEPERS))
4217 thresh >>= 1;
4218
4219 vruntime -= thresh;
4220 }
4221
4222
4223 se->vruntime = max_vruntime(se->vruntime, vruntime);
4224}
4225
4226static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
4227
4228static inline void check_schedstat_required(void)
4229{
4230#ifdef CONFIG_SCHEDSTATS
4231 if (schedstat_enabled())
4232 return;
4233
4234
4235 if (trace_sched_stat_wait_enabled() ||
4236 trace_sched_stat_sleep_enabled() ||
4237 trace_sched_stat_iowait_enabled() ||
4238 trace_sched_stat_blocked_enabled() ||
4239 trace_sched_stat_runtime_enabled()) {
4240 printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
4241 "stat_blocked and stat_runtime require the "
4242 "kernel parameter schedstats=enable or "
4243 "kernel.sched_schedstats=1\n");
4244 }
4245#endif
4246}
4247
4248static inline bool cfs_bandwidth_used(void);
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
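/*
 * Renormalization in enqueue_entity(): entities that left a cfs_rq
 * (migration, group move, initial placement) carry a vruntime relative
 * to the old queue's min_vruntime, so 'renorm' adds this queue's
 * min_vruntime back.  If the entity is current, renormalize before
 * update_curr() so it is charged against the new base; otherwise after,
 * so it is placed at the current min_vruntime rather than a stale one.
 */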
4280static void
4281enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4282{
4283 bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
4284 bool curr = cfs_rq->curr == se;
4285
4286
4287
4288
4289
4290 if (renorm && curr)
4291 se->vruntime += cfs_rq->min_vruntime;
4292
4293 update_curr(cfs_rq);
4294
4295
4296
4297
4298
4299
4300
4301 if (renorm && !curr)
4302 se->vruntime += cfs_rq->min_vruntime;
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312 update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
4313 se_update_runnable(se);
4314 update_cfs_group(se);
4315 account_entity_enqueue(cfs_rq, se);
4316
4317 if (flags & ENQUEUE_WAKEUP)
4318 place_entity(cfs_rq, se, 0);
4319
4320 check_schedstat_required();
4321 update_stats_enqueue(cfs_rq, se, flags);
4322 check_spread(cfs_rq, se);
4323 if (!curr)
4324 __enqueue_entity(cfs_rq, se);
4325 se->on_rq = 1;
4326
4327
4328
4329
4330
4331
4332 if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
4333 list_add_leaf_cfs_rq(cfs_rq);
4334
4335 if (cfs_rq->nr_running == 1)
4336 check_enqueue_throttle(cfs_rq);
4337}
4338
4339static void __clear_buddies_last(struct sched_entity *se)
4340{
4341 for_each_sched_entity(se) {
4342 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4343 if (cfs_rq->last != se)
4344 break;
4345
4346 cfs_rq->last = NULL;
4347 }
4348}
4349
4350static void __clear_buddies_next(struct sched_entity *se)
4351{
4352 for_each_sched_entity(se) {
4353 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4354 if (cfs_rq->next != se)
4355 break;
4356
4357 cfs_rq->next = NULL;
4358 }
4359}
4360
4361static void __clear_buddies_skip(struct sched_entity *se)
4362{
4363 for_each_sched_entity(se) {
4364 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4365 if (cfs_rq->skip != se)
4366 break;
4367
4368 cfs_rq->skip = NULL;
4369 }
4370}
4371
4372static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
4373{
4374 if (cfs_rq->last == se)
4375 __clear_buddies_last(se);
4376
4377 if (cfs_rq->next == se)
4378 __clear_buddies_next(se);
4379
4380 if (cfs_rq->skip == se)
4381 __clear_buddies_skip(se);
4382}
4383
4384static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4385
4386static void
4387dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4388{
4389
4390
4391
4392 update_curr(cfs_rq);
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402 update_load_avg(cfs_rq, se, UPDATE_TG);
4403 se_update_runnable(se);
4404
4405 update_stats_dequeue(cfs_rq, se, flags);
4406
4407 clear_buddies(cfs_rq, se);
4408
4409 if (se != cfs_rq->curr)
4410 __dequeue_entity(cfs_rq, se);
4411 se->on_rq = 0;
4412 account_entity_dequeue(cfs_rq, se);
4413
4414
4415
4416
4417
4418
4419
4420 if (!(flags & DEQUEUE_SLEEP))
4421 se->vruntime -= cfs_rq->min_vruntime;
4422
4423
4424 return_cfs_rq_runtime(cfs_rq);
4425
4426 update_cfs_group(se);
4427
4428
4429
4430
4431
4432
4433
4434 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
4435 update_min_vruntime(cfs_rq);
4436}
4437
4438
4439
4440
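/*
 * check_preempt_tick() - preempt the current task if it has run past its
 * ideal slice, or if it has run at least sched_min_granularity and its
 * vruntime leads the leftmost entity by more than a full slice.
 */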
4441static void
4442check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4443{
4444 unsigned long ideal_runtime, delta_exec;
4445 struct sched_entity *se;
4446 s64 delta;
4447
4448 ideal_runtime = sched_slice(cfs_rq, curr);
4449 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
4450 if (delta_exec > ideal_runtime) {
4451 resched_curr(rq_of(cfs_rq));
4452
4453
4454
4455
4456 clear_buddies(cfs_rq, curr);
4457 return;
4458 }
4459
4460
4461
4462
4463
4464
4465 if (delta_exec < sysctl_sched_min_granularity)
4466 return;
4467
4468 se = __pick_first_entity(cfs_rq);
4469 delta = curr->vruntime - se->vruntime;
4470
4471 if (delta < 0)
4472 return;
4473
4474 if (delta > ideal_runtime)
4475 resched_curr(rq_of(cfs_rq));
4476}
4477
4478static void
4479set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
4480{
4481 clear_buddies(cfs_rq, se);
4482
4483
4484 if (se->on_rq) {
4485
4486
4487
4488
4489
4490 update_stats_wait_end(cfs_rq, se);
4491 __dequeue_entity(cfs_rq, se);
4492 update_load_avg(cfs_rq, se, UPDATE_TG);
4493 }
4494
4495 update_stats_curr_start(cfs_rq, se);
4496 cfs_rq->curr = se;
4497
4498
4499
4500
4501
4502
4503 if (schedstat_enabled() &&
4504 rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
4505 schedstat_set(se->statistics.slice_max,
4506 max((u64)schedstat_val(se->statistics.slice_max),
4507 se->sum_exec_runtime - se->prev_sum_exec_runtime));
4508 }
4509
4510 se->prev_sum_exec_runtime = se->sum_exec_runtime;
4511}
4512
4513static int
4514wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
4515
4516
4517
4518
4519
4520
4521
4522
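/*
 * pick_next_entity() - pick the leftmost (smallest vruntime) entity,
 * unless buddy hints say otherwise: honour the 'skip' hint (yield), and
 * prefer the 'next' and 'last' buddies when doing so does not give up
 * too much fairness (wakeup_preempt_entity() < 1).
 */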
4523static struct sched_entity *
4524pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4525{
4526 struct sched_entity *left = __pick_first_entity(cfs_rq);
4527 struct sched_entity *se;
4528
4529
4530
4531
4532
4533 if (!left || (curr && entity_before(curr, left)))
4534 left = curr;
4535
4536 se = left;
4537
4538
4539
4540
4541
4542 if (cfs_rq->skip && cfs_rq->skip == se) {
4543 struct sched_entity *second;
4544
4545 if (se == curr) {
4546 second = __pick_first_entity(cfs_rq);
4547 } else {
4548 second = __pick_next_entity(se);
4549 if (!second || (curr && entity_before(curr, second)))
4550 second = curr;
4551 }
4552
4553 if (second && wakeup_preempt_entity(second, left) < 1)
4554 se = second;
4555 }
4556
4557 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
4558
4559
4560
4561 se = cfs_rq->next;
4562 } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
4563
4564
4565
4566 se = cfs_rq->last;
4567 }
4568
4569 return se;
4570}
4571
4572static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4573
4574static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
4575{
4576
4577
4578
4579
4580 if (prev->on_rq)
4581 update_curr(cfs_rq);
4582
4583
4584 check_cfs_rq_runtime(cfs_rq);
4585
4586 check_spread(cfs_rq, prev);
4587
4588 if (prev->on_rq) {
4589 update_stats_wait_start(cfs_rq, prev);
4590
4591 __enqueue_entity(cfs_rq, prev);
4592
4593 update_load_avg(cfs_rq, prev, 0);
4594 }
4595 cfs_rq->curr = NULL;
4596}
4597
4598static void
4599entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
4600{
4601
4602
4603
4604 update_curr(cfs_rq);
4605
4606
4607
4608
4609 update_load_avg(cfs_rq, curr, UPDATE_TG);
4610 update_cfs_group(curr);
4611
4612#ifdef CONFIG_SCHED_HRTICK
4613
4614
4615
4616
4617 if (queued) {
4618 resched_curr(rq_of(cfs_rq));
4619 return;
4620 }
4621
4622
4623
4624 if (!sched_feat(DOUBLE_TICK) &&
4625 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
4626 return;
4627#endif
4628
4629 if (cfs_rq->nr_running > 1)
4630 check_preempt_tick(cfs_rq, curr);
4631}
4632
4633
4634
4635
4636
4637
4638#ifdef CONFIG_CFS_BANDWIDTH
4639
4640#ifdef CONFIG_JUMP_LABEL
4641static struct static_key __cfs_bandwidth_used;
4642
4643static inline bool cfs_bandwidth_used(void)
4644{
4645 return static_key_false(&__cfs_bandwidth_used);
4646}
4647
4648void cfs_bandwidth_usage_inc(void)
4649{
4650 static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
4651}
4652
4653void cfs_bandwidth_usage_dec(void)
4654{
4655 static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
4656}
4657#else
4658static bool cfs_bandwidth_used(void)
4659{
4660 return true;
4661}
4662
4663void cfs_bandwidth_usage_inc(void) {}
4664void cfs_bandwidth_usage_dec(void) {}
4665#endif
4666
4667
4668
4669
4670
4671static inline u64 default_cfs_period(void)
4672{
4673 return 100000000ULL;
4674}
4675
4676static inline u64 sched_cfs_bandwidth_slice(void)
4677{
4678 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
4679}
4680
4681
4682
4683
4684
4685
4686
4687
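/*
 * Replenish runtime according to the quota.  Unused runtime may carry
 * over, but the accumulated total is capped at quota + burst.
 */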
4688void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
4689{
4690 if (unlikely(cfs_b->quota == RUNTIME_INF))
4691 return;
4692
4693 cfs_b->runtime += cfs_b->quota;
4694 cfs_b->runtime = min(cfs_b->runtime, cfs_b->quota + cfs_b->burst);
4695}
4696
4697static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4698{
4699 return &tg->cfs_bandwidth;
4700}
4701
4702
4703static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
4704 struct cfs_rq *cfs_rq, u64 target_runtime)
4705{
4706 u64 min_amount, amount = 0;
4707
4708 lockdep_assert_held(&cfs_b->lock);
4709
4710
4711 min_amount = target_runtime - cfs_rq->runtime_remaining;
4712
4713 if (cfs_b->quota == RUNTIME_INF)
4714 amount = min_amount;
4715 else {
4716 start_cfs_bandwidth(cfs_b);
4717
4718 if (cfs_b->runtime > 0) {
4719 amount = min(cfs_b->runtime, min_amount);
4720 cfs_b->runtime -= amount;
4721 cfs_b->idle = 0;
4722 }
4723 }
4724
4725 cfs_rq->runtime_remaining += amount;
4726
4727 return cfs_rq->runtime_remaining > 0;
4728}
4729
4730
4731static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4732{
4733 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4734 int ret;
4735
4736 raw_spin_lock(&cfs_b->lock);
4737 ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
4738 raw_spin_unlock(&cfs_b->lock);
4739
4740 return ret;
4741}
4742
4743static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4744{
4745
4746 cfs_rq->runtime_remaining -= delta_exec;
4747
4748 if (likely(cfs_rq->runtime_remaining > 0))
4749 return;
4750
4751 if (cfs_rq->throttled)
4752 return;
4753
4754
4755
4756
4757 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
4758 resched_curr(rq_of(cfs_rq));
4759}
4760
4761static __always_inline
4762void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4763{
4764 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
4765 return;
4766
4767 __account_cfs_rq_runtime(cfs_rq, delta_exec);
4768}
4769
4770static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4771{
4772 return cfs_bandwidth_used() && cfs_rq->throttled;
4773}
4774
4775
4776static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4777{
4778 return cfs_bandwidth_used() && cfs_rq->throttle_count;
4779}
4780
4781
4782
4783
4784
4785
4786static inline int throttled_lb_pair(struct task_group *tg,
4787 int src_cpu, int dest_cpu)
4788{
4789 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
4790
4791 src_cfs_rq = tg->cfs_rq[src_cpu];
4792 dest_cfs_rq = tg->cfs_rq[dest_cpu];
4793
4794 return throttled_hierarchy(src_cfs_rq) ||
4795 throttled_hierarchy(dest_cfs_rq);
4796}
4797
4798static int tg_unthrottle_up(struct task_group *tg, void *data)
4799{
4800 struct rq *rq = data;
4801 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4802
4803 cfs_rq->throttle_count--;
4804 if (!cfs_rq->throttle_count) {
4805 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
4806 cfs_rq->throttled_clock_task;
4807
4808
4809 if (!cfs_rq_is_decayed(cfs_rq) || cfs_rq->nr_running)
4810 list_add_leaf_cfs_rq(cfs_rq);
4811 }
4812
4813 return 0;
4814}
4815
4816static int tg_throttle_down(struct task_group *tg, void *data)
4817{
4818 struct rq *rq = data;
4819 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4820
4821
4822 if (!cfs_rq->throttle_count) {
4823 cfs_rq->throttled_clock_task = rq_clock_task(rq);
4824 list_del_leaf_cfs_rq(cfs_rq);
4825 }
4826 cfs_rq->throttle_count++;
4827
4828 return 0;
4829}
4830
4831static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
4832{
4833 struct rq *rq = rq_of(cfs_rq);
4834 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4835 struct sched_entity *se;
4836 long task_delta, idle_task_delta, dequeue = 1;
4837
4838 raw_spin_lock(&cfs_b->lock);
4839
4840 if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
4841
4842
4843
4844
4845
4846
4847
4848
4849 dequeue = 0;
4850 } else {
4851 list_add_tail_rcu(&cfs_rq->throttled_list,
4852 &cfs_b->throttled_cfs_rq);
4853 }
4854 raw_spin_unlock(&cfs_b->lock);
4855
4856 if (!dequeue)
4857 return false;
4858
4859 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
4860
4861
4862 rcu_read_lock();
4863 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
4864 rcu_read_unlock();
4865
4866 task_delta = cfs_rq->h_nr_running;
4867 idle_task_delta = cfs_rq->idle_h_nr_running;
4868 for_each_sched_entity(se) {
4869 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
4870
4871 if (!se->on_rq)
4872 goto done;
4873
4874 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
4875
4876 if (cfs_rq_is_idle(group_cfs_rq(se)))
4877 idle_task_delta = cfs_rq->h_nr_running;
4878
4879 qcfs_rq->h_nr_running -= task_delta;
4880 qcfs_rq->idle_h_nr_running -= idle_task_delta;
4881
4882 if (qcfs_rq->load.weight) {
4883
4884 se = parent_entity(se);
4885 break;
4886 }
4887 }
4888
4889 for_each_sched_entity(se) {
4890 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
4891
4892 if (!se->on_rq)
4893 goto done;
4894
4895 update_load_avg(qcfs_rq, se, 0);
4896 se_update_runnable(se);
4897
4898 if (cfs_rq_is_idle(group_cfs_rq(se)))
4899 idle_task_delta = cfs_rq->h_nr_running;
4900
4901 qcfs_rq->h_nr_running -= task_delta;
4902 qcfs_rq->idle_h_nr_running -= idle_task_delta;
4903 }
4904
4905
4906 sub_nr_running(rq, task_delta);
4907
4908done:
4909
4910
4911
4912
4913 cfs_rq->throttled = 1;
4914 cfs_rq->throttled_clock = rq_clock(rq);
4915 return true;
4916}
4917
4918void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
4919{
4920 struct rq *rq = rq_of(cfs_rq);
4921 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4922 struct sched_entity *se;
4923 long task_delta, idle_task_delta;
4924
4925 se = cfs_rq->tg->se[cpu_of(rq)];
4926
4927 cfs_rq->throttled = 0;
4928
4929 update_rq_clock(rq);
4930
4931 raw_spin_lock(&cfs_b->lock);
4932 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
4933 list_del_rcu(&cfs_rq->throttled_list);
4934 raw_spin_unlock(&cfs_b->lock);
4935
4936
4937 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
4938
4939
4940 if (!cfs_rq->load.weight) {
4941 if (cfs_rq->on_list)
4942 goto unthrottle_throttle;
4943 return;
4944 }
4945
4946 task_delta = cfs_rq->h_nr_running;
4947 idle_task_delta = cfs_rq->idle_h_nr_running;
4948 for_each_sched_entity(se) {
4949 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
4950
4951 if (se->on_rq)
4952 break;
4953 enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
4954
4955 if (cfs_rq_is_idle(group_cfs_rq(se)))
4956 idle_task_delta = cfs_rq->h_nr_running;
4957
4958 qcfs_rq->h_nr_running += task_delta;
4959 qcfs_rq->idle_h_nr_running += idle_task_delta;
4960
4961
4962 if (cfs_rq_throttled(qcfs_rq))
4963 goto unthrottle_throttle;
4964 }
4965
4966 for_each_sched_entity(se) {
4967 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
4968
4969 update_load_avg(qcfs_rq, se, UPDATE_TG);
4970 se_update_runnable(se);
4971
4972 if (cfs_rq_is_idle(group_cfs_rq(se)))
4973 idle_task_delta = cfs_rq->h_nr_running;
4974
4975 qcfs_rq->h_nr_running += task_delta;
4976 qcfs_rq->idle_h_nr_running += idle_task_delta;
4977
4978
4979 if (cfs_rq_throttled(qcfs_rq))
4980 goto unthrottle_throttle;
4981
4982
4983
4984
4985
4986 if (throttled_hierarchy(qcfs_rq))
4987 list_add_leaf_cfs_rq(qcfs_rq);
4988 }
4989
4990
4991 add_nr_running(rq, task_delta);
4992
4993unthrottle_throttle:
4994
4995
4996
4997
4998
4999 for_each_sched_entity(se) {
5000 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
5001
5002 if (list_add_leaf_cfs_rq(qcfs_rq))
5003 break;
5004 }
5005
5006 assert_list_leaf_cfs_rq(rq);
5007
5008
5009 if (rq->curr == rq->idle && rq->cfs.nr_running)
5010 resched_curr(rq);
5011}
5012
5013static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
5014{
5015 struct cfs_rq *cfs_rq;
5016 u64 runtime, remaining = 1;
5017
5018 rcu_read_lock();
5019 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
5020 throttled_list) {
5021 struct rq *rq = rq_of(cfs_rq);
5022 struct rq_flags rf;
5023
5024 rq_lock_irqsave(rq, &rf);
5025 if (!cfs_rq_throttled(cfs_rq))
5026 goto next;
5027
5028
5029 SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
5030
5031 raw_spin_lock(&cfs_b->lock);
5032 runtime = -cfs_rq->runtime_remaining + 1;
5033 if (runtime > cfs_b->runtime)
5034 runtime = cfs_b->runtime;
5035 cfs_b->runtime -= runtime;
5036 remaining = cfs_b->runtime;
5037 raw_spin_unlock(&cfs_b->lock);
5038
5039 cfs_rq->runtime_remaining += runtime;
5040
5041
5042 if (cfs_rq->runtime_remaining > 0)
5043 unthrottle_cfs_rq(cfs_rq);
5044
5045next:
5046 rq_unlock_irqrestore(rq, &rf);
5047
5048 if (!remaining)
5049 break;
5050 }
5051 rcu_read_unlock();
5052}
5053
5054
5055
5056
5057
5058
5059
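/*
 * do_sched_cfs_period_timer() - called when the period timer expires:
 * replenish runtime and hand it out to throttled cfs_rqs.  Returns 1
 * when the bandwidth was idle (or quota is infinite), telling the
 * caller to let the period timer go idle.
 */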
5060static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
5061{
5062 int throttled;
5063
5064
5065 if (cfs_b->quota == RUNTIME_INF)
5066 goto out_deactivate;
5067
5068 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
5069 cfs_b->nr_periods += overrun;
5070
5071
5072 __refill_cfs_bandwidth_runtime(cfs_b);
5073
5074
5075
5076
5077
5078 if (cfs_b->idle && !throttled)
5079 goto out_deactivate;
5080
5081 if (!throttled) {
5082
5083 cfs_b->idle = 1;
5084 return 0;
5085 }
5086
5087
5088 cfs_b->nr_throttled += overrun;
5089
5090
5091
5092
5093 while (throttled && cfs_b->runtime > 0) {
5094 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
5095
5096 distribute_cfs_runtime(cfs_b);
5097 raw_spin_lock_irqsave(&cfs_b->lock, flags);
5098
5099 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
5100 }
5101
5102
5103
5104
5105
5106
5107
5108 cfs_b->idle = 0;
5109
5110 return 0;
5111
5112out_deactivate:
5113 return 1;
5114}
5115
5116
5117static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
5118
5119static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
5120
5121static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
5122
5123
5124
5125
5126
5127
5128
5129
5130static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
5131{
5132 struct hrtimer *refresh_timer = &cfs_b->period_timer;
5133 s64 remaining;
5134
5135
5136 if (hrtimer_callback_running(refresh_timer))
5137 return 1;
5138
5139
5140 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
5141 if (remaining < (s64)min_expire)
5142 return 1;
5143
5144 return 0;
5145}
5146
5147static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
5148{
5149 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
5150
5151
5152 if (runtime_refresh_within(cfs_b, min_left))
5153 return;
5154
5155
5156 if (cfs_b->slack_started)
5157 return;
5158 cfs_b->slack_started = true;
5159
5160 hrtimer_start(&cfs_b->slack_timer,
5161 ns_to_ktime(cfs_bandwidth_slack_period),
5162 HRTIMER_MODE_REL);
5163}
5164
5165
5166static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5167{
5168 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5169 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
5170
5171 if (slack_runtime <= 0)
5172 return;
5173
5174 raw_spin_lock(&cfs_b->lock);
5175 if (cfs_b->quota != RUNTIME_INF) {
5176 cfs_b->runtime += slack_runtime;
5177
5178
5179 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
5180 !list_empty(&cfs_b->throttled_cfs_rq))
5181 start_cfs_slack_bandwidth(cfs_b);
5182 }
5183 raw_spin_unlock(&cfs_b->lock);
5184
5185
5186 cfs_rq->runtime_remaining -= slack_runtime;
5187}
5188
5189static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5190{
5191 if (!cfs_bandwidth_used())
5192 return;
5193
5194 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
5195 return;
5196
5197 __return_cfs_rq_runtime(cfs_rq);
5198}
5199
5200
5201
5202
5203
5204static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
5205{
5206 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
5207 unsigned long flags;
5208
5209
5210 raw_spin_lock_irqsave(&cfs_b->lock, flags);
5211 cfs_b->slack_started = false;
5212
5213 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
5214 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
5215 return;
5216 }
5217
5218 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
5219 runtime = cfs_b->runtime;
5220
5221 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
5222
5223 if (!runtime)
5224 return;
5225
5226 distribute_cfs_runtime(cfs_b);
5227}
5228
5229
5230
5231
5232
5233
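/*
 * check_enqueue_throttle() - when a cfs_rq gains its first entity, make
 * sure it has runtime available; if none can be assigned, throttle it
 * right away so the group cannot run without quota.
 */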
5234static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
5235{
5236 if (!cfs_bandwidth_used())
5237 return;
5238
5239
5240 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
5241 return;
5242
5243
5244 if (cfs_rq_throttled(cfs_rq))
5245 return;
5246
5247
5248 account_cfs_rq_runtime(cfs_rq, 0);
5249 if (cfs_rq->runtime_remaining <= 0)
5250 throttle_cfs_rq(cfs_rq);
5251}
5252
5253static void sync_throttle(struct task_group *tg, int cpu)
5254{
5255 struct cfs_rq *pcfs_rq, *cfs_rq;
5256
5257 if (!cfs_bandwidth_used())
5258 return;
5259
5260 if (!tg->parent)
5261 return;
5262
5263 cfs_rq = tg->cfs_rq[cpu];
5264 pcfs_rq = tg->parent->cfs_rq[cpu];
5265
5266 cfs_rq->throttle_count = pcfs_rq->throttle_count;
5267 cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
5268}
5269
5270
5271static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5272{
5273 if (!cfs_bandwidth_used())
5274 return false;
5275
5276 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
5277 return false;
5278
5279
5280
5281
5282
5283 if (cfs_rq_throttled(cfs_rq))
5284 return true;
5285
5286 return throttle_cfs_rq(cfs_rq);
5287}
5288
5289static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
5290{
5291 struct cfs_bandwidth *cfs_b =
5292 container_of(timer, struct cfs_bandwidth, slack_timer);
5293
5294 do_sched_cfs_slack_timer(cfs_b);
5295
5296 return HRTIMER_NORESTART;
5297}
5298
5299extern const u64 max_cfs_quota_period;
5300
5301static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
5302{
5303 struct cfs_bandwidth *cfs_b =
5304 container_of(timer, struct cfs_bandwidth, period_timer);
5305 unsigned long flags;
5306 int overrun;
5307 int idle = 0;
5308 int count = 0;
5309
5310 raw_spin_lock_irqsave(&cfs_b->lock, flags);
5311 for (;;) {
5312 overrun = hrtimer_forward_now(timer, cfs_b->period);
5313 if (!overrun)
5314 break;
5315
5316 idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
5317
5318 if (++count > 3) {
5319 u64 new, old = ktime_to_ns(cfs_b->period);
5320
5321
5322
5323
5324
5325
5326 new = old * 2;
5327 if (new < max_cfs_quota_period) {
5328 cfs_b->period = ns_to_ktime(new);
5329 cfs_b->quota *= 2;
5330 cfs_b->burst *= 2;
5331
5332 pr_warn_ratelimited(
5333 "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
5334 smp_processor_id(),
5335 div_u64(new, NSEC_PER_USEC),
5336 div_u64(cfs_b->quota, NSEC_PER_USEC));
5337 } else {
5338 pr_warn_ratelimited(
5339 "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
5340 smp_processor_id(),
5341 div_u64(old, NSEC_PER_USEC),
5342 div_u64(cfs_b->quota, NSEC_PER_USEC));
5343 }
5344
5345
5346 count = 0;
5347 }
5348 }
5349 if (idle)
5350 cfs_b->period_active = 0;
5351 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
5352
5353 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
5354}
5355
5356void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5357{
5358 raw_spin_lock_init(&cfs_b->lock);
5359 cfs_b->runtime = 0;
5360 cfs_b->quota = RUNTIME_INF;
5361 cfs_b->period = ns_to_ktime(default_cfs_period());
5362 cfs_b->burst = 0;
5363
5364 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
5365 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
5366 cfs_b->period_timer.function = sched_cfs_period_timer;
5367 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5368 cfs_b->slack_timer.function = sched_cfs_slack_timer;
5369 cfs_b->slack_started = false;
5370}
5371
5372static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5373{
5374 cfs_rq->runtime_enabled = 0;
5375 INIT_LIST_HEAD(&cfs_rq->throttled_list);
5376}
5377
5378void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5379{
5380 lockdep_assert_held(&cfs_b->lock);
5381
5382 if (cfs_b->period_active)
5383 return;
5384
5385 cfs_b->period_active = 1;
5386 hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
5387 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
5388}
5389
5390static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5391{
5392
5393 if (!cfs_b->throttled_cfs_rq.next)
5394 return;
5395
5396 hrtimer_cancel(&cfs_b->period_timer);
5397 hrtimer_cancel(&cfs_b->slack_timer);
5398}
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408static void __maybe_unused update_runtime_enabled(struct rq *rq)
5409{
5410 struct task_group *tg;
5411
5412 lockdep_assert_rq_held(rq);
5413
5414 rcu_read_lock();
5415 list_for_each_entry_rcu(tg, &task_groups, list) {
5416 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
5417 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5418
5419 raw_spin_lock(&cfs_b->lock);
5420 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
5421 raw_spin_unlock(&cfs_b->lock);
5422 }
5423 rcu_read_unlock();
5424}
5425
5426
5427static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
5428{
5429 struct task_group *tg;
5430
5431 lockdep_assert_rq_held(rq);
5432
5433 rcu_read_lock();
5434 list_for_each_entry_rcu(tg, &task_groups, list) {
5435 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5436
5437 if (!cfs_rq->runtime_enabled)
5438 continue;
5439
5440
5441
5442
5443
5444 cfs_rq->runtime_remaining = 1;
5445
5446
5447
5448
5449 cfs_rq->runtime_enabled = 0;
5450
5451 if (cfs_rq_throttled(cfs_rq))
5452 unthrottle_cfs_rq(cfs_rq);
5453 }
5454 rcu_read_unlock();
5455}
5456
5457#else
5458
5459static inline bool cfs_bandwidth_used(void)
5460{
5461 return false;
5462}
5463
5464static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
5465static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
5466static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
5467static inline void sync_throttle(struct task_group *tg, int cpu) {}
5468static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
5469
5470static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
5471{
5472 return 0;
5473}
5474
5475static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
5476{
5477 return 0;
5478}
5479
5480static inline int throttled_lb_pair(struct task_group *tg,
5481 int src_cpu, int dest_cpu)
5482{
5483 return 0;
5484}
5485
5486void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5487
5488#ifdef CONFIG_FAIR_GROUP_SCHED
5489static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
5490#endif
5491
5492static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
5493{
5494 return NULL;
5495}
5496static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5497static inline void update_runtime_enabled(struct rq *rq) {}
5498static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
5499
5500#endif
5501
5502
5503
5504
5505
5506#ifdef CONFIG_SCHED_HRTICK
5507static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
5508{
5509 struct sched_entity *se = &p->se;
5510 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5511
5512 SCHED_WARN_ON(task_rq(p) != rq);
5513
5514 if (rq->cfs.h_nr_running > 1) {
5515 u64 slice = sched_slice(cfs_rq, se);
5516 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
5517 s64 delta = slice - ran;
5518
5519 if (delta < 0) {
5520 if (task_current(rq, p))
5521 resched_curr(rq);
5522 return;
5523 }
5524 hrtick_start(rq, delta);
5525 }
5526}
5527
5528
5529
5530
5531
5532
5533static void hrtick_update(struct rq *rq)
5534{
5535 struct task_struct *curr = rq->curr;
5536
5537 if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class)
5538 return;
5539
5540 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
5541 hrtick_start_fair(rq, curr);
5542}
5543#else
5544static inline void
5545hrtick_start_fair(struct rq *rq, struct task_struct *p)
5546{
5547}
5548
5549static inline void hrtick_update(struct rq *rq)
5550{
5551}
5552#endif
5553
5554#ifdef CONFIG_SMP
5555static inline unsigned long cpu_util(int cpu);
5556
5557static inline bool cpu_overutilized(int cpu)
5558{
5559 return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
5560}
5561
5562static inline void update_overutilized_status(struct rq *rq)
5563{
5564 if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
5565 WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
5566 trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
5567 }
5568}
5569#else
5570static inline void update_overutilized_status(struct rq *rq) { }
5571#endif
5572
5573
5574static int sched_idle_rq(struct rq *rq)
5575{
5576 return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
5577 rq->nr_running);
5578}
5579
5580#ifdef CONFIG_SMP
5581static int sched_idle_cpu(int cpu)
5582{
5583 return sched_idle_rq(cpu_rq(cpu));
5584}
5585#endif
5586
5587
5588
5589
5590
5591
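/*
 * The enqueue_task method is called before nr_running is increased. Update
 * the fair scheduling stats and then put the task into the rbtree.
 */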
5592static void
5593enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5594{
5595 struct cfs_rq *cfs_rq;
5596 struct sched_entity *se = &p->se;
5597 int idle_h_nr_running = task_has_idle_policy(p);
5598 int task_new = !(flags & ENQUEUE_WAKEUP);
5599
5600
5601
5602
5603
5604
5605
5606 util_est_enqueue(&rq->cfs, p);
5607
5608
5609
5610
5611
5612
5613 if (p->in_iowait)
5614 cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
5615
5616 for_each_sched_entity(se) {
5617 if (se->on_rq)
5618 break;
5619 cfs_rq = cfs_rq_of(se);
5620 enqueue_entity(cfs_rq, se, flags);
5621
5622 cfs_rq->h_nr_running++;
5623 cfs_rq->idle_h_nr_running += idle_h_nr_running;
5624
5625 if (cfs_rq_is_idle(cfs_rq))
5626 idle_h_nr_running = 1;
5627
5628
5629 if (cfs_rq_throttled(cfs_rq))
5630 goto enqueue_throttle;
5631
5632 flags = ENQUEUE_WAKEUP;
5633 }
5634
5635 for_each_sched_entity(se) {
5636 cfs_rq = cfs_rq_of(se);
5637
5638 update_load_avg(cfs_rq, se, UPDATE_TG);
5639 se_update_runnable(se);
5640 update_cfs_group(se);
5641
5642 cfs_rq->h_nr_running++;
5643 cfs_rq->idle_h_nr_running += idle_h_nr_running;
5644
5645 if (cfs_rq_is_idle(cfs_rq))
5646 idle_h_nr_running = 1;
5647
5648
5649 if (cfs_rq_throttled(cfs_rq))
5650 goto enqueue_throttle;
5651
5652
5653
5654
5655
5656 if (throttled_hierarchy(cfs_rq))
5657 list_add_leaf_cfs_rq(cfs_rq);
5658 }
5659
5660
5661 add_nr_running(rq, 1);
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677 if (!task_new)
5678 update_overutilized_status(rq);
5679
5680enqueue_throttle:
5681 if (cfs_bandwidth_used()) {
5682
5683
5684
5685
5686
5687
5688 for_each_sched_entity(se) {
5689 cfs_rq = cfs_rq_of(se);
5690
5691 if (list_add_leaf_cfs_rq(cfs_rq))
5692 break;
5693 }
5694 }
5695
5696 assert_list_leaf_cfs_rq(rq);
5697
5698 hrtick_update(rq);
5699}
5700
5701static void set_next_buddy(struct sched_entity *se);
5702
5703
5704
5705
5706
5707
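/*
 * The dequeue_task method is called before nr_running is decreased. Remove
 * the task from the rbtree and update the fair scheduling stats.
 */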
5708static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5709{
5710 struct cfs_rq *cfs_rq;
5711 struct sched_entity *se = &p->se;
5712 int task_sleep = flags & DEQUEUE_SLEEP;
5713 int idle_h_nr_running = task_has_idle_policy(p);
5714 bool was_sched_idle = sched_idle_rq(rq);
5715
5716 util_est_dequeue(&rq->cfs, p);
5717
5718 for_each_sched_entity(se) {
5719 cfs_rq = cfs_rq_of(se);
5720 dequeue_entity(cfs_rq, se, flags);
5721
5722 cfs_rq->h_nr_running--;
5723 cfs_rq->idle_h_nr_running -= idle_h_nr_running;
5724
5725 if (cfs_rq_is_idle(cfs_rq))
5726 idle_h_nr_running = 1;
5727
5728
5729 if (cfs_rq_throttled(cfs_rq))
5730 goto dequeue_throttle;
5731
5732
5733 if (cfs_rq->load.weight) {
5734
5735 se = parent_entity(se);
5736
5737
5738
5739
5740 if (task_sleep && se && !throttled_hierarchy(cfs_rq))
5741 set_next_buddy(se);
5742 break;
5743 }
5744 flags |= DEQUEUE_SLEEP;
5745 }
5746
5747 for_each_sched_entity(se) {
5748 cfs_rq = cfs_rq_of(se);
5749
5750 update_load_avg(cfs_rq, se, UPDATE_TG);
5751 se_update_runnable(se);
5752 update_cfs_group(se);
5753
5754 cfs_rq->h_nr_running--;
5755 cfs_rq->idle_h_nr_running -= idle_h_nr_running;
5756
5757 if (cfs_rq_is_idle(cfs_rq))
5758 idle_h_nr_running = 1;
5759
5760
5761 if (cfs_rq_throttled(cfs_rq))
5762 goto dequeue_throttle;
5763
5764 }
5765
5766
5767 sub_nr_running(rq, 1);
5768
5769
5770 if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
5771 rq->next_balance = jiffies;
5772
5773dequeue_throttle:
5774 util_est_update(&rq->cfs, p, task_sleep);
5775 hrtick_update(rq);
5776}
5777
5778#ifdef CONFIG_SMP
5779
5780
5781DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
5782DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
5783
5784#ifdef CONFIG_NO_HZ_COMMON
5785
5786static struct {
5787 cpumask_var_t idle_cpus_mask;
5788 atomic_t nr_cpus;
5789 int has_blocked;
5790 unsigned long next_balance;
5791 unsigned long next_blocked;
5792} nohz ____cacheline_aligned;
5793
5794#endif
5795
5796static unsigned long cpu_load(struct rq *rq)
5797{
5798 return cfs_rq_load_avg(&rq->cfs);
5799}
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
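/*
 * cpu_load_without - compute the CPU load without any contribution from *p.
 * Falls back to cpu_load() when *p is not currently contributing to this
 * runqueue's load average.
 */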
5814static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
5815{
5816 struct cfs_rq *cfs_rq;
5817 unsigned int load;
5818
5819
5820 if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
5821 return cpu_load(rq);
5822
5823 cfs_rq = &rq->cfs;
5824 load = READ_ONCE(cfs_rq->avg.load_avg);
5825
5826
5827 lsub_positive(&load, task_h_load(p));
5828
5829 return load;
5830}
5831
5832static unsigned long cpu_runnable(struct rq *rq)
5833{
5834 return cfs_rq_runnable_avg(&rq->cfs);
5835}
5836
5837static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p)
5838{
5839 struct cfs_rq *cfs_rq;
5840 unsigned int runnable;
5841
5842
5843 if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
5844 return cpu_runnable(rq);
5845
5846 cfs_rq = &rq->cfs;
5847 runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
5848
5849
5850 lsub_positive(&runnable, p->se.avg.runnable_avg);
5851
5852 return runnable;
5853}
5854
5855static unsigned long capacity_of(int cpu)
5856{
5857 return cpu_rq(cpu)->cpu_capacity;
5858}
5859
5860static void record_wakee(struct task_struct *p)
5861{
5862
5863
5864
5865
5866 if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
5867 current->wakee_flips >>= 1;
5868 current->wakee_flip_decay_ts = jiffies;
5869 }
5870
5871 if (current->last_wakee != p) {
5872 current->last_wakee = p;
5873 current->wakee_flips++;
5874 }
5875}
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
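/*
 * Detect M:N waker/wakee relationships via a switching-frequency heuristic:
 * when both the waker and the wakee flip partners at a high rate relative to
 * the LLC size, the relationship is not 1:1 and pulling the wakee onto the
 * waker's LLC is unlikely to pay off. Returns 1 when an affine wakeup should
 * be avoided.
 */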
5894static int wake_wide(struct task_struct *p)
5895{
5896 unsigned int master = current->wakee_flips;
5897 unsigned int slave = p->wakee_flips;
5898 int factor = __this_cpu_read(sd_llc_size);
5899
5900 if (master < slave)
5901 swap(master, slave);
5902 if (slave < factor || master < slave * factor)
5903 return 0;
5904 return 1;
5905}
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
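/*
 * wake_affine_idle() - prefer the waking CPU (or prev_cpu) when one of them
 * is idle and they share cache, or when a sync wakeup is about to leave the
 * waking CPU with nothing else to run. Returns nr_cpumask_bits when no
 * suitable CPU was found.
 */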
5919static int
5920wake_affine_idle(int this_cpu, int prev_cpu, int sync)
5921{
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934 if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
5935 return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
5936
5937 if (sync && cpu_rq(this_cpu)->nr_running == 1)
5938 return this_cpu;
5939
5940 if (available_idle_cpu(prev_cpu))
5941 return prev_cpu;
5942
5943 return nr_cpumask_bits;
5944}
5945
5946static int
5947wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5948 int this_cpu, int prev_cpu, int sync)
5949{
5950 s64 this_eff_load, prev_eff_load;
5951 unsigned long task_load;
5952
5953 this_eff_load = cpu_load(cpu_rq(this_cpu));
5954
5955 if (sync) {
5956 unsigned long current_load = task_h_load(current);
5957
5958 if (current_load > this_eff_load)
5959 return this_cpu;
5960
5961 this_eff_load -= current_load;
5962 }
5963
5964 task_load = task_h_load(p);
5965
5966 this_eff_load += task_load;
5967 if (sched_feat(WA_BIAS))
5968 this_eff_load *= 100;
5969 this_eff_load *= capacity_of(prev_cpu);
5970
5971 prev_eff_load = cpu_load(cpu_rq(prev_cpu));
5972 prev_eff_load -= task_load;
5973 if (sched_feat(WA_BIAS))
5974 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
5975 prev_eff_load *= capacity_of(this_cpu);
5976
5977
5978
5979
5980
5981
5982
5983 if (sync)
5984 prev_eff_load += 1;
5985
5986 return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
5987}
5988
5989static int wake_affine(struct sched_domain *sd, struct task_struct *p,
5990 int this_cpu, int prev_cpu, int sync)
5991{
5992 int target = nr_cpumask_bits;
5993
5994 if (sched_feat(WA_IDLE))
5995 target = wake_affine_idle(this_cpu, prev_cpu, sync);
5996
5997 if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
5998 target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
5999
6000 schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
6001 if (target == nr_cpumask_bits)
6002 return prev_cpu;
6003
6004 schedstat_inc(sd->ttwu_move_affine);
6005 schedstat_inc(p->se.statistics.nr_wakeups_affine);
6006 return target;
6007}
6008
6009static struct sched_group *
6010find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
6011
6012
6013
6014
6015static int
6016find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
6017{
6018 unsigned long load, min_load = ULONG_MAX;
6019 unsigned int min_exit_latency = UINT_MAX;
6020 u64 latest_idle_timestamp = 0;
6021 int least_loaded_cpu = this_cpu;
6022 int shallowest_idle_cpu = -1;
6023 int i;
6024
6025
6026 if (group->group_weight == 1)
6027 return cpumask_first(sched_group_span(group));
6028
6029
6030 for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
6031 struct rq *rq = cpu_rq(i);
6032
6033 if (!sched_core_cookie_match(rq, p))
6034 continue;
6035
6036 if (sched_idle_cpu(i))
6037 return i;
6038
6039 if (available_idle_cpu(i)) {
6040 struct cpuidle_state *idle = idle_get_state(rq);
6041 if (idle && idle->exit_latency < min_exit_latency) {
6042
6043
6044
6045
6046
6047 min_exit_latency = idle->exit_latency;
6048 latest_idle_timestamp = rq->idle_stamp;
6049 shallowest_idle_cpu = i;
6050 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
6051 rq->idle_stamp > latest_idle_timestamp) {
6052
6053
6054
6055
6056
6057 latest_idle_timestamp = rq->idle_stamp;
6058 shallowest_idle_cpu = i;
6059 }
6060 } else if (shallowest_idle_cpu == -1) {
6061 load = cpu_load(cpu_rq(i));
6062 if (load < min_load) {
6063 min_load = load;
6064 least_loaded_cpu = i;
6065 }
6066 }
6067 }
6068
6069 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
6070}
6071
6072static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
6073 int cpu, int prev_cpu, int sd_flag)
6074{
6075 int new_cpu = cpu;
6076
6077 if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
6078 return prev_cpu;
6079
6080
6081
6082
6083
6084 if (!(sd_flag & SD_BALANCE_FORK))
6085 sync_entity_load_avg(&p->se);
6086
6087 while (sd) {
6088 struct sched_group *group;
6089 struct sched_domain *tmp;
6090 int weight;
6091
6092 if (!(sd->flags & sd_flag)) {
6093 sd = sd->child;
6094 continue;
6095 }
6096
6097 group = find_idlest_group(sd, p, cpu);
6098 if (!group) {
6099 sd = sd->child;
6100 continue;
6101 }
6102
6103 new_cpu = find_idlest_group_cpu(group, p, cpu);
6104 if (new_cpu == cpu) {
6105
6106 sd = sd->child;
6107 continue;
6108 }
6109
6110
6111 cpu = new_cpu;
6112 weight = sd->span_weight;
6113 sd = NULL;
6114 for_each_domain(cpu, tmp) {
6115 if (weight <= tmp->span_weight)
6116 break;
6117 if (tmp->flags & sd_flag)
6118 sd = tmp;
6119 }
6120 }
6121
6122 return new_cpu;
6123}
6124
6125static inline int __select_idle_cpu(int cpu, struct task_struct *p)
6126{
6127 if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
6128 sched_cpu_cookie_match(cpu_rq(cpu), p))
6129 return cpu;
6130
6131 return -1;
6132}
6133
6134#ifdef CONFIG_SCHED_SMT
6135DEFINE_STATIC_KEY_FALSE(sched_smt_present);
6136EXPORT_SYMBOL_GPL(sched_smt_present);
6137
6138static inline void set_idle_cores(int cpu, int val)
6139{
6140 struct sched_domain_shared *sds;
6141
6142 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
6143 if (sds)
6144 WRITE_ONCE(sds->has_idle_cores, val);
6145}
6146
6147static inline bool test_idle_cores(int cpu, bool def)
6148{
6149 struct sched_domain_shared *sds;
6150
6151 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
6152 if (sds)
6153 return READ_ONCE(sds->has_idle_cores);
6154
6155 return def;
6156}
6157
6158
6159
6160
6161
6162
6163
6164
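/*
 * Called when this runqueue's CPU goes idle: if every sibling in the SMT
 * core is now idle, record that in sd_llc_shared->has_idle_cores. Since SMT
 * siblings share all cache levels, inspecting this limited remote state is
 * cheap.
 */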
6165void __update_idle_core(struct rq *rq)
6166{
6167 int core = cpu_of(rq);
6168 int cpu;
6169
6170 rcu_read_lock();
6171 if (test_idle_cores(core, true))
6172 goto unlock;
6173
6174 for_each_cpu(cpu, cpu_smt_mask(core)) {
6175 if (cpu == core)
6176 continue;
6177
6178 if (!available_idle_cpu(cpu))
6179 goto unlock;
6180 }
6181
6182 set_idle_cores(core, 1);
6183unlock:
6184 rcu_read_unlock();
6185}
6186
6187
6188
6189
6190
6191
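/*
 * Check whether @core is a fully idle SMT core. On failure the core's
 * siblings are removed from @cpus and -1 is returned; an individually idle
 * CPU seen along the way is remembered in *idle_cpu.
 */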
6192static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
6193{
6194 bool idle = true;
6195 int cpu;
6196
6197 if (!static_branch_likely(&sched_smt_present))
6198 return __select_idle_cpu(core, p);
6199
6200 for_each_cpu(cpu, cpu_smt_mask(core)) {
6201 if (!available_idle_cpu(cpu)) {
6202 idle = false;
6203 if (*idle_cpu == -1) {
6204 if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->cpus_ptr)) {
6205 *idle_cpu = cpu;
6206 break;
6207 }
6208 continue;
6209 }
6210 break;
6211 }
6212 if (*idle_cpu == -1 && cpumask_test_cpu(cpu, p->cpus_ptr))
6213 *idle_cpu = cpu;
6214 }
6215
6216 if (idle)
6217 return core;
6218
6219 cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
6220 return -1;
6221}
6222
6223
6224
6225
6226static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
6227{
6228 int cpu;
6229
6230 for_each_cpu(cpu, cpu_smt_mask(target)) {
6231 if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
6232 !cpumask_test_cpu(cpu, sched_domain_span(sd)))
6233 continue;
6234 if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
6235 return cpu;
6236 }
6237
6238 return -1;
6239}
6240
6241#else
6242
6243static inline void set_idle_cores(int cpu, int val)
6244{
6245}
6246
6247static inline bool test_idle_cores(int cpu, bool def)
6248{
6249 return def;
6250}
6251
6252static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
6253{
6254 return __select_idle_cpu(core, p);
6255}
6256
6257static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
6258{
6259 return -1;
6260}
6261
6262#endif
6263
6264
6265
6266
6267
6268
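/*
 * Scan the LLC domain for idle CPUs. When SIS_PROP is enabled and no idle
 * core is expected, the scan depth is bounded by comparing the domain's
 * average scan cost against this runqueue's decaying average wakeup idle
 * time.
 */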
6269static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target)
6270{
6271 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
6272 int i, cpu, idle_cpu = -1, nr = INT_MAX;
6273 struct rq *this_rq = this_rq();
6274 int this = smp_processor_id();
6275 struct sched_domain *this_sd;
6276 u64 time = 0;
6277
6278 this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
6279 if (!this_sd)
6280 return -1;
6281
6282 cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
6283
6284 if (sched_feat(SIS_PROP) && !has_idle_core) {
6285 u64 avg_cost, avg_idle, span_avg;
6286 unsigned long now = jiffies;
6287
6288
6289
6290
6291
6292
6293 if (unlikely(this_rq->wake_stamp < now)) {
6294 while (this_rq->wake_stamp < now && this_rq->wake_avg_idle) {
6295 this_rq->wake_stamp++;
6296 this_rq->wake_avg_idle >>= 1;
6297 }
6298 }
6299
6300 avg_idle = this_rq->wake_avg_idle;
6301 avg_cost = this_sd->avg_scan_cost + 1;
6302
6303 span_avg = sd->span_weight * avg_idle;
6304 if (span_avg > 4*avg_cost)
6305 nr = div_u64(span_avg, avg_cost);
6306 else
6307 nr = 4;
6308
6309 time = cpu_clock(this);
6310 }
6311
6312 for_each_cpu_wrap(cpu, cpus, target + 1) {
6313 if (has_idle_core) {
6314 i = select_idle_core(p, cpu, cpus, &idle_cpu);
6315 if ((unsigned int)i < nr_cpumask_bits)
6316 return i;
6317
6318 } else {
6319 if (!--nr)
6320 return -1;
6321 idle_cpu = __select_idle_cpu(cpu, p);
6322 if ((unsigned int)idle_cpu < nr_cpumask_bits)
6323 break;
6324 }
6325 }
6326
6327 if (has_idle_core)
6328 set_idle_cores(target, false);
6329
6330 if (sched_feat(SIS_PROP) && !has_idle_core) {
6331 time = cpu_clock(this) - time;
6332
6333
6334
6335
6336
6337 this_rq->wake_avg_idle -= min(this_rq->wake_avg_idle, time);
6338
6339 update_avg(&this_sd->avg_scan_cost, time);
6340 }
6341
6342 return idle_cpu;
6343}
6344
6345
6346
6347
6348
6349
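/*
 * Scan the asymmetric-capacity domain for an idle CPU the task fits on; if
 * none is big enough, fall back to the idle CPU with the largest capacity.
 */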
6350static int
6351select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
6352{
6353 unsigned long task_util, best_cap = 0;
6354 int cpu, best_cpu = -1;
6355 struct cpumask *cpus;
6356
6357 cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
6358 cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
6359
6360 task_util = uclamp_task_util(p);
6361
6362 for_each_cpu_wrap(cpu, cpus, target) {
6363 unsigned long cpu_cap = capacity_of(cpu);
6364
6365 if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
6366 continue;
6367 if (fits_capacity(task_util, cpu_cap))
6368 return cpu;
6369
6370 if (cpu_cap > best_cap) {
6371 best_cap = cpu_cap;
6372 best_cpu = cpu;
6373 }
6374 }
6375
6376 return best_cpu;
6377}
6378
6379static inline bool asym_fits_capacity(int task_util, int cpu)
6380{
6381 if (static_branch_unlikely(&sched_asym_cpucapacity))
6382 return fits_capacity(task_util, capacity_of(cpu));
6383
6384 return true;
6385}
6386
6387
6388
6389
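/*
 * Try and locate an idle core/thread in the LLC cache domain, starting from
 * @target and falling back to @prev and the recently used CPU.
 */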
6390static int select_idle_sibling(struct task_struct *p, int prev, int target)
6391{
6392 bool has_idle_core = false;
6393 struct sched_domain *sd;
6394 unsigned long task_util;
6395 int i, recent_used_cpu;
6396
6397
6398
6399
6400
6401 if (static_branch_unlikely(&sched_asym_cpucapacity)) {
6402 sync_entity_load_avg(&p->se);
6403 task_util = uclamp_task_util(p);
6404 }
6405
6406
6407
6408
6409 lockdep_assert_irqs_disabled();
6410
6411 if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
6412 asym_fits_capacity(task_util, target))
6413 return target;
6414
6415
6416
6417
6418 if (prev != target && cpus_share_cache(prev, target) &&
6419 (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
6420 asym_fits_capacity(task_util, prev))
6421 return prev;
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431 if (is_per_cpu_kthread(current) &&
6432 prev == smp_processor_id() &&
6433 this_rq()->nr_running <= 1) {
6434 return prev;
6435 }
6436
6437
6438 recent_used_cpu = p->recent_used_cpu;
6439 p->recent_used_cpu = prev;
6440 if (recent_used_cpu != prev &&
6441 recent_used_cpu != target &&
6442 cpus_share_cache(recent_used_cpu, target) &&
6443 (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
6444 cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
6445 asym_fits_capacity(task_util, recent_used_cpu)) {
6446
6447
6448
6449
6450 p->recent_used_cpu = prev;
6451 return recent_used_cpu;
6452 }
6453
6454
6455
6456
6457
6458 if (static_branch_unlikely(&sched_asym_cpucapacity)) {
6459 sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
6460
6461
6462
6463
6464
6465
6466
6467
6468 if (sd) {
6469 i = select_idle_capacity(p, sd, target);
6470 return ((unsigned)i < nr_cpumask_bits) ? i : target;
6471 }
6472 }
6473
6474 sd = rcu_dereference(per_cpu(sd_llc, target));
6475 if (!sd)
6476 return target;
6477
6478 if (sched_smt_active()) {
6479 has_idle_core = test_idle_cores(target, false);
6480
6481 if (!has_idle_core && cpus_share_cache(prev, target)) {
6482 i = select_idle_smt(p, sd, prev);
6483 if ((unsigned int)i < nr_cpumask_bits)
6484 return i;
6485 }
6486 }
6487
6488 i = select_idle_cpu(p, sd, has_idle_core, target);
6489 if ((unsigned)i < nr_cpumask_bits)
6490 return i;
6491
6492 return target;
6493}
6494
6495
6532
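/*
 * cpu_util() - amount of capacity currently (estimated to be) used by CFS
 * tasks on @cpu: the max of the PELT util_avg and the util_est sum of the
 * enqueued tasks, clamped to the CPU's original capacity.
 */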
6533static inline unsigned long cpu_util(int cpu)
6534{
6535 struct cfs_rq *cfs_rq;
6536 unsigned int util;
6537
6538 cfs_rq = &cpu_rq(cpu)->cfs;
6539 util = READ_ONCE(cfs_rq->avg.util_avg);
6540
6541 if (sched_feat(UTIL_EST))
6542 util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
6543
6544 return min_t(unsigned long, util, capacity_orig_of(cpu));
6545}
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
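/*
 * cpu_util_without - estimate cpu_util(@cpu) as if *p had already been
 * dequeued, so that a wakeup placement decision is not biased by the task's
 * own contribution to that CPU.
 */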
6560static unsigned long cpu_util_without(int cpu, struct task_struct *p)
6561{
6562 struct cfs_rq *cfs_rq;
6563 unsigned int util;
6564
6565
6566 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
6567 return cpu_util(cpu);
6568
6569 cfs_rq = &cpu_rq(cpu)->cfs;
6570 util = READ_ONCE(cfs_rq->avg.util_avg);
6571
6572
6573 lsub_positive(&util, task_util(p));
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601 if (sched_feat(UTIL_EST)) {
6602 unsigned int estimated =
6603 READ_ONCE(cfs_rq->avg.util_est.enqueued);
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622 if (unlikely(task_on_rq_queued(p) || current == p))
6623 lsub_positive(&estimated, _task_util_est(p));
6624
6625 util = max(util, estimated);
6626 }
6627
6628
6629
6630
6631
6632
6633 return min_t(unsigned long, util, capacity_orig_of(cpu));
6634}
6635
6636
6637
6638
6639
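/*
 * Predict what cpu_util(@cpu) would return if @p were migrated (and
 * enqueued) to @dst_cpu.
 */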
6640static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
6641{
6642 struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
6643 unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
6644
6645
6646
6647
6648
6649
6650
6651 if (task_cpu(p) == cpu && dst_cpu != cpu)
6652 lsub_positive(&util, task_util(p));
6653 else if (task_cpu(p) != cpu && dst_cpu == cpu)
6654 util += task_util(p);
6655
6656 if (sched_feat(UTIL_EST)) {
6657 util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
6658
6659
6660
6661
6662
6663
6664
6665 if (dst_cpu == cpu)
6666 util_est += _task_util_est(p);
6667
6668 util = max(util, util_est);
6669 }
6670
6671 return min(util, capacity_orig_of(cpu));
6672}
6673
6674
6675
6676
6677
6678
6679
6680
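/*
 * compute_energy() - estimate the energy the performance domain @pd would
 * consume if @p were migrated to @dst_cpu, by predicting the resulting
 * utilization landscape of @pd's CPUs and handing it to the Energy Model.
 */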
6681static long
6682compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
6683{
6684 struct cpumask *pd_mask = perf_domain_span(pd);
6685 unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
6686 unsigned long max_util = 0, sum_util = 0;
6687 unsigned long _cpu_cap = cpu_cap;
6688 int cpu;
6689
6690 _cpu_cap -= arch_scale_thermal_pressure(cpumask_first(pd_mask));
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701 for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
6702 unsigned long util_freq = cpu_util_next(cpu, p, dst_cpu);
6703 unsigned long cpu_util, util_running = util_freq;
6704 struct task_struct *tsk = NULL;
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715 if (cpu == dst_cpu) {
6716 tsk = p;
6717 util_running =
6718 cpu_util_next(cpu, p, -1) + task_util_est(p);
6719 }
6720
6721
6722
6723
6724
6725
6726
6727 cpu_util = effective_cpu_util(cpu, util_running, cpu_cap,
6728 ENERGY_UTIL, NULL);
6729
6730 sum_util += min(cpu_util, _cpu_cap);
6731
6732
6733
6734
6735
6736
6737
6738
6739 cpu_util = effective_cpu_util(cpu, util_freq, cpu_cap,
6740 FREQUENCY_UTIL, tsk);
6741 max_util = max(max_util, min(cpu_util, _cpu_cap));
6742 }
6743
6744 return em_cpu_energy(pd->em_pd, max_util, sum_util, _cpu_cap);
6745}
6746
6747
6785
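/*
 * find_energy_efficient_cpu() - Energy-Aware Scheduling wakeup placement.
 * For each performance domain, consider prev_cpu and the CPU with the most
 * spare capacity, estimate the energy delta of placing the task on each
 * candidate, and pick the one saving the most energy, provided the saving is
 * large enough (more than roughly 1/16th of the previous total energy) to be
 * worth a migration. Returns -1 when EAS cannot be used (no performance
 * domains or the root domain is overutilized), letting the caller fall back
 * to the regular wakeup path.
 */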
6786static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
6787{
6788 unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
6789 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
6790 int cpu, best_energy_cpu = prev_cpu, target = -1;
6791 unsigned long cpu_cap, util, base_energy = 0;
6792 struct sched_domain *sd;
6793 struct perf_domain *pd;
6794
6795 rcu_read_lock();
6796 pd = rcu_dereference(rd->pd);
6797 if (!pd || READ_ONCE(rd->overutilized))
6798 goto unlock;
6799
6800
6801
6802
6803
6804 sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
6805 while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
6806 sd = sd->parent;
6807 if (!sd)
6808 goto unlock;
6809
6810 target = prev_cpu;
6811
6812 sync_entity_load_avg(&p->se);
6813 if (!task_util_est(p))
6814 goto unlock;
6815
6816 for (; pd; pd = pd->next) {
6817 unsigned long cur_delta, spare_cap, max_spare_cap = 0;
6818 bool compute_prev_delta = false;
6819 unsigned long base_energy_pd;
6820 int max_spare_cap_cpu = -1;
6821
6822 for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
6823 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6824 continue;
6825
6826 util = cpu_util_next(cpu, p, cpu);
6827 cpu_cap = capacity_of(cpu);
6828 spare_cap = cpu_cap;
6829 lsub_positive(&spare_cap, util);
6830
6831
6832
6833
6834
6835
6836
6837
6838 util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
6839 if (!fits_capacity(util, cpu_cap))
6840 continue;
6841
6842 if (cpu == prev_cpu) {
6843
6844 compute_prev_delta = true;
6845 } else if (spare_cap > max_spare_cap) {
6846
6847
6848
6849
6850 max_spare_cap = spare_cap;
6851 max_spare_cap_cpu = cpu;
6852 }
6853 }
6854
6855 if (max_spare_cap_cpu < 0 && !compute_prev_delta)
6856 continue;
6857
6858
6859 base_energy_pd = compute_energy(p, -1, pd);
6860 base_energy += base_energy_pd;
6861
6862
6863 if (compute_prev_delta) {
6864 prev_delta = compute_energy(p, prev_cpu, pd);
6865 if (prev_delta < base_energy_pd)
6866 goto unlock;
6867 prev_delta -= base_energy_pd;
6868 best_delta = min(best_delta, prev_delta);
6869 }
6870
6871
6872 if (max_spare_cap_cpu >= 0) {
6873 cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
6874 if (cur_delta < base_energy_pd)
6875 goto unlock;
6876 cur_delta -= base_energy_pd;
6877 if (cur_delta < best_delta) {
6878 best_delta = cur_delta;
6879 best_energy_cpu = max_spare_cap_cpu;
6880 }
6881 }
6882 }
6883 rcu_read_unlock();
6884
6885
6886
6887
6888
6889 if ((prev_delta == ULONG_MAX) ||
6890 (prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
6891 target = best_energy_cpu;
6892
6893 return target;
6894
6895unlock:
6896 rcu_read_unlock();
6897
6898 return target;
6899}
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
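/*
 * select_task_rq_fair() - pick a target runqueue for a waking, forking or
 * exec'ing task. Tries the energy-aware path first, then a wake-affine fast
 * path within the LLC, and otherwise walks the domains looking for the
 * idlest group/CPU. Returns the target CPU number.
 */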
6911static int
6912select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
6913{
6914 int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
6915 struct sched_domain *tmp, *sd = NULL;
6916 int cpu = smp_processor_id();
6917 int new_cpu = prev_cpu;
6918 int want_affine = 0;
6919
6920 int sd_flag = wake_flags & 0xF;
6921
6922
6923
6924
6925 lockdep_assert_held(&p->pi_lock);
6926 if (wake_flags & WF_TTWU) {
6927 record_wakee(p);
6928
6929 if (sched_energy_enabled()) {
6930 new_cpu = find_energy_efficient_cpu(p, prev_cpu);
6931 if (new_cpu >= 0)
6932 return new_cpu;
6933 new_cpu = prev_cpu;
6934 }
6935
6936 want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
6937 }
6938
6939 rcu_read_lock();
6940 for_each_domain(cpu, tmp) {
6941
6942
6943
6944
6945 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
6946 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
6947 if (cpu != prev_cpu)
6948 new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
6949
6950 sd = NULL;
6951 break;
6952 }
6953
6954 if (tmp->flags & sd_flag)
6955 sd = tmp;
6956 else if (!want_affine)
6957 break;
6958 }
6959
6960 if (unlikely(sd)) {
6961
6962 new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
6963 } else if (wake_flags & WF_TTWU) {
6964
6965 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
6966 }
6967 rcu_read_unlock();
6968
6969 return new_cpu;
6970}
6971
6972static void detach_entity_cfs_rq(struct sched_entity *se);
6973
6974
6975
6976
6977
6978
6979static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
6980{
6981
6982
6983
6984
6985
6986
6987 if (READ_ONCE(p->__state) == TASK_WAKING) {
6988 struct sched_entity *se = &p->se;
6989 struct cfs_rq *cfs_rq = cfs_rq_of(se);
6990 u64 min_vruntime;
6991
6992#ifndef CONFIG_64BIT
6993 u64 min_vruntime_copy;
6994
6995 do {
6996 min_vruntime_copy = cfs_rq->min_vruntime_copy;
6997 smp_rmb();
6998 min_vruntime = cfs_rq->min_vruntime;
6999 } while (min_vruntime != min_vruntime_copy);
7000#else
7001 min_vruntime = cfs_rq->min_vruntime;
7002#endif
7003
7004 se->vruntime -= min_vruntime;
7005 }
7006
7007 if (p->on_rq == TASK_ON_RQ_MIGRATING) {
7008
7009
7010
7011
7012 lockdep_assert_rq_held(task_rq(p));
7013 detach_entity_cfs_rq(&p->se);
7014
7015 } else {
7016
7017
7018
7019
7020
7021
7022
7023
7024 remove_entity_load_avg(&p->se);
7025 }
7026
7027
7028 p->se.avg.last_update_time = 0;
7029
7030
7031 p->se.exec_start = 0;
7032
7033 update_scan_period(p, new_cpu);
7034}
7035
7036static void task_dead_fair(struct task_struct *p)
7037{
7038 remove_entity_load_avg(&p->se);
7039}
7040
7041static int
7042balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
7043{
7044 if (rq->nr_running)
7045 return 1;
7046
7047 return newidle_balance(rq, rf) != 0;
7048}
7049#endif
7050
7051static unsigned long wakeup_gran(struct sched_entity *se)
7052{
7053 unsigned long gran = sysctl_sched_wakeup_granularity;
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068 return calc_delta_fair(gran, se);
7069}
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085static int
7086wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
7087{
7088 s64 gran, vdiff = curr->vruntime - se->vruntime;
7089
7090 if (vdiff <= 0)
7091 return -1;
7092
7093 gran = wakeup_gran(se);
7094 if (vdiff > gran)
7095 return 1;
7096
7097 return 0;
7098}
7099
7100static void set_last_buddy(struct sched_entity *se)
7101{
7102 for_each_sched_entity(se) {
7103 if (SCHED_WARN_ON(!se->on_rq))
7104 return;
7105 if (se_is_idle(se))
7106 return;
7107 cfs_rq_of(se)->last = se;
7108 }
7109}
7110
7111static void set_next_buddy(struct sched_entity *se)
7112{
7113 for_each_sched_entity(se) {
7114 if (SCHED_WARN_ON(!se->on_rq))
7115 return;
7116 if (se_is_idle(se))
7117 return;
7118 cfs_rq_of(se)->next = se;
7119 }
7120}
7121
7122static void set_skip_buddy(struct sched_entity *se)
7123{
7124 for_each_sched_entity(se)
7125 cfs_rq_of(se)->skip = se;
7126}
7127
7128
7129
7130
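/*
 * Preempt the current task with a newly woken task if needed:
 */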
7131static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
7132{
7133 struct task_struct *curr = rq->curr;
7134 struct sched_entity *se = &curr->se, *pse = &p->se;
7135 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
7136 int scale = cfs_rq->nr_running >= sched_nr_latency;
7137 int next_buddy_marked = 0;
7138 int cse_is_idle, pse_is_idle;
7139
7140 if (unlikely(se == pse))
7141 return;
7142
7143
7144
7145
7146
7147
7148
7149 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
7150 return;
7151
7152 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
7153 set_next_buddy(pse);
7154 next_buddy_marked = 1;
7155 }
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167 if (test_tsk_need_resched(curr))
7168 return;
7169
7170
7171 if (unlikely(task_has_idle_policy(curr)) &&
7172 likely(!task_has_idle_policy(p)))
7173 goto preempt;
7174
7175
7176
7177
7178
7179 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
7180 return;
7181
7182 find_matching_se(&se, &pse);
7183 BUG_ON(!pse);
7184
7185 cse_is_idle = se_is_idle(se);
7186 pse_is_idle = se_is_idle(pse);
7187
7188
7189
7190
7191
7192 if (cse_is_idle && !pse_is_idle)
7193 goto preempt;
7194 if (cse_is_idle != pse_is_idle)
7195 return;
7196
7197 update_curr(cfs_rq_of(se));
7198 if (wakeup_preempt_entity(se, pse) == 1) {
7199
7200
7201
7202
7203 if (!next_buddy_marked)
7204 set_next_buddy(pse);
7205 goto preempt;
7206 }
7207
7208 return;
7209
7210preempt:
7211 resched_curr(rq);
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221 if (unlikely(!se->on_rq || curr == rq->idle))
7222 return;
7223
7224 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
7225 set_last_buddy(se);
7226}
7227
7228#ifdef CONFIG_SMP
7229static struct task_struct *pick_task_fair(struct rq *rq)
7230{
7231 struct sched_entity *se;
7232 struct cfs_rq *cfs_rq;
7233
7234again:
7235 cfs_rq = &rq->cfs;
7236 if (!cfs_rq->nr_running)
7237 return NULL;
7238
7239 do {
7240 struct sched_entity *curr = cfs_rq->curr;
7241
7242
7243 if (curr) {
7244 if (curr->on_rq)
7245 update_curr(cfs_rq);
7246 else
7247 curr = NULL;
7248
7249 if (unlikely(check_cfs_rq_runtime(cfs_rq)))
7250 goto again;
7251 }
7252
7253 se = pick_next_entity(cfs_rq, curr);
7254 cfs_rq = group_cfs_rq(se);
7255 } while (cfs_rq);
7256
7257 return task_of(se);
7258}
7259#endif
7260
7261struct task_struct *
7262pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
7263{
7264 struct cfs_rq *cfs_rq = &rq->cfs;
7265 struct sched_entity *se;
7266 struct task_struct *p;
7267 int new_tasks;
7268
7269again:
7270 if (!sched_fair_runnable(rq))
7271 goto idle;
7272
7273#ifdef CONFIG_FAIR_GROUP_SCHED
7274 if (!prev || prev->sched_class != &fair_sched_class)
7275 goto simple;
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285 do {
7286 struct sched_entity *curr = cfs_rq->curr;
7287
7288
7289
7290
7291
7292
7293
7294 if (curr) {
7295 if (curr->on_rq)
7296 update_curr(cfs_rq);
7297 else
7298 curr = NULL;
7299
7300
7301
7302
7303
7304
7305
7306 if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
7307 cfs_rq = &rq->cfs;
7308
7309 if (!cfs_rq->nr_running)
7310 goto idle;
7311
7312 goto simple;
7313 }
7314 }
7315
7316 se = pick_next_entity(cfs_rq, curr);
7317 cfs_rq = group_cfs_rq(se);
7318 } while (cfs_rq);
7319
7320 p = task_of(se);
7321
7322
7323
7324
7325
7326
7327 if (prev != p) {
7328 struct sched_entity *pse = &prev->se;
7329
7330 while (!(cfs_rq = is_same_group(se, pse))) {
7331 int se_depth = se->depth;
7332 int pse_depth = pse->depth;
7333
7334 if (se_depth <= pse_depth) {
7335 put_prev_entity(cfs_rq_of(pse), pse);
7336 pse = parent_entity(pse);
7337 }
7338 if (se_depth >= pse_depth) {
7339 set_next_entity(cfs_rq_of(se), se);
7340 se = parent_entity(se);
7341 }
7342 }
7343
7344 put_prev_entity(cfs_rq, pse);
7345 set_next_entity(cfs_rq, se);
7346 }
7347
7348 goto done;
7349simple:
7350#endif
7351 if (prev)
7352 put_prev_task(rq, prev);
7353
7354 do {
7355 se = pick_next_entity(cfs_rq, NULL);
7356 set_next_entity(cfs_rq, se);
7357 cfs_rq = group_cfs_rq(se);
7358 } while (cfs_rq);
7359
7360 p = task_of(se);
7361
7362done: __maybe_unused;
7363#ifdef CONFIG_SMP
7364
7365
7366
7367
7368
7369 list_move(&p->se.group_node, &rq->cfs_tasks);
7370#endif
7371
7372 if (hrtick_enabled_fair(rq))
7373 hrtick_start_fair(rq, p);
7374
7375 update_misfit_status(p, rq);
7376
7377 return p;
7378
7379idle:
7380 if (!rf)
7381 return NULL;
7382
7383 new_tasks = newidle_balance(rq, rf);
7384
7385
7386
7387
7388
7389
7390 if (new_tasks < 0)
7391 return RETRY_TASK;
7392
7393 if (new_tasks > 0)
7394 goto again;
7395
7396
7397
7398
7399
7400 update_idle_rq_clock_pelt(rq);
7401
7402 return NULL;
7403}
7404
7405static struct task_struct *__pick_next_task_fair(struct rq *rq)
7406{
7407 return pick_next_task_fair(rq, NULL, NULL);
7408}
7409
7410
7411
7412
7413static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
7414{
7415 struct sched_entity *se = &prev->se;
7416 struct cfs_rq *cfs_rq;
7417
7418 for_each_sched_entity(se) {
7419 cfs_rq = cfs_rq_of(se);
7420 put_prev_entity(cfs_rq, se);
7421 }
7422}
7423
7424
7425
7426
7427
7428
7429static void yield_task_fair(struct rq *rq)
7430{
7431 struct task_struct *curr = rq->curr;
7432 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
7433 struct sched_entity *se = &curr->se;
7434
7435
7436
7437
7438 if (unlikely(rq->nr_running == 1))
7439 return;
7440
7441 clear_buddies(cfs_rq, se);
7442
7443 if (curr->policy != SCHED_BATCH) {
7444 update_rq_clock(rq);
7445
7446
7447
7448 update_curr(cfs_rq);
7449
7450
7451
7452
7453
7454 rq_clock_skip_update(rq);
7455 }
7456
7457 set_skip_buddy(se);
7458}
7459
7460static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
7461{
7462 struct sched_entity *se = &p->se;
7463
7464
7465 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
7466 return false;
7467
7468
7469 set_next_buddy(se);
7470
7471 yield_task_fair(rq);
7472
7473 return true;
7474}
7475
7476#ifdef CONFIG_SMP
7477
7594
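/*
 * Fair scheduling class load-balancing section.
 *
 * Load is balanced hierarchically along the sched_domain tree: each CPU
 * periodically (and when it goes idle) tries to pull runnable tasks from the
 * busiest group/runqueue in the domains it belongs to, so that every group
 * ends up with a load proportional to its capacity. Task-group shares are
 * accounted for through the hierarchical (h_)load of each task.
 */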
7595static unsigned long __read_mostly max_load_balance_interval = HZ/10;
7596
7597enum fbq_type { regular, remote, all };
7598
7599
7600
7601
7602
7603
7604
7605
7606enum group_type {
7607
7608 group_has_spare = 0,
7609
7610
7611
7612
7613 group_fully_busy,
7614
7615
7616
7617
7618 group_misfit_task,
7619
7620
7621
7622
7623
7624 group_asym_packing,
7625
7626
7627
7628
7629 group_imbalanced,
7630
7631
7632
7633
7634 group_overloaded
7635};
7636
7637enum migration_type {
7638 migrate_load = 0,
7639 migrate_util,
7640 migrate_task,
7641 migrate_misfit
7642};
7643
7644#define LBF_ALL_PINNED 0x01
7645#define LBF_NEED_BREAK 0x02
7646#define LBF_DST_PINNED 0x04
7647#define LBF_SOME_PINNED 0x08
7648#define LBF_ACTIVE_LB 0x10
7649
7650struct lb_env {
7651 struct sched_domain *sd;
7652
7653 struct rq *src_rq;
7654 int src_cpu;
7655
7656 int dst_cpu;
7657 struct rq *dst_rq;
7658
7659 struct cpumask *dst_grpmask;
7660 int new_dst_cpu;
7661 enum cpu_idle_type idle;
7662 long imbalance;
7663
7664 struct cpumask *cpus;
7665
7666 unsigned int flags;
7667
7668 unsigned int loop;
7669 unsigned int loop_break;
7670 unsigned int loop_max;
7671
7672 enum fbq_type fbq_type;
7673 enum migration_type migration_type;
7674 struct list_head tasks;
7675};
7676
7677
7678
7679
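/*
 * Is this task likely cache-hot on its current CPU?
 */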
7680static int task_hot(struct task_struct *p, struct lb_env *env)
7681{
7682 s64 delta;
7683
7684 lockdep_assert_rq_held(env->src_rq);
7685
7686 if (p->sched_class != &fair_sched_class)
7687 return 0;
7688
7689 if (unlikely(task_has_idle_policy(p)))
7690 return 0;
7691
7692
7693 if (env->sd->flags & SD_SHARE_CPUCAPACITY)
7694 return 0;
7695
7696
7697
7698
7699 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
7700 (&p->se == cfs_rq_of(&p->se)->next ||
7701 &p->se == cfs_rq_of(&p->se)->last))
7702 return 1;
7703
7704 if (sysctl_sched_migration_cost == -1)
7705 return 1;
7706
7707
7708
7709
7710
7711 if (!sched_core_cookie_match(cpu_rq(env->dst_cpu), p))
7712 return 1;
7713
7714 if (sysctl_sched_migration_cost == 0)
7715 return 0;
7716
7717 delta = rq_clock_task(env->src_rq) - p->se.exec_start;
7718
7719 return delta < (s64)sysctl_sched_migration_cost;
7720}
7721
7722#ifdef CONFIG_NUMA_BALANCING
7723
7724
7725
7726
7727
7728static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
7729{
7730 struct numa_group *numa_group = rcu_dereference(p->numa_group);
7731 unsigned long src_weight, dst_weight;
7732 int src_nid, dst_nid, dist;
7733
7734 if (!static_branch_likely(&sched_numa_balancing))
7735 return -1;
7736
7737 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
7738 return -1;
7739
7740 src_nid = cpu_to_node(env->src_cpu);
7741 dst_nid = cpu_to_node(env->dst_cpu);
7742
7743 if (src_nid == dst_nid)
7744 return -1;
7745
7746
7747 if (src_nid == p->numa_preferred_nid) {
7748 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
7749 return 1;
7750 else
7751 return -1;
7752 }
7753
7754
7755 if (dst_nid == p->numa_preferred_nid)
7756 return 0;
7757
7758
7759 if (env->idle == CPU_IDLE)
7760 return -1;
7761
7762 dist = node_distance(src_nid, dst_nid);
7763 if (numa_group) {
7764 src_weight = group_weight(p, src_nid, dist);
7765 dst_weight = group_weight(p, dst_nid, dist);
7766 } else {
7767 src_weight = task_weight(p, src_nid, dist);
7768 dst_weight = task_weight(p, dst_nid, dist);
7769 }
7770
7771 return dst_weight < src_weight;
7772}
7773
7774#else
7775static inline int migrate_degrades_locality(struct task_struct *p,
7776 struct lb_env *env)
7777{
7778 return -1;
7779}
7780#endif
7781
7782
7783
7784
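/*
 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
 */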
7785static
7786int can_migrate_task(struct task_struct *p, struct lb_env *env)
7787{
7788 int tsk_cache_hot;
7789
7790 lockdep_assert_rq_held(env->src_rq);
7791
7792
7793
7794
7795
7796
7797
7798
7799 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
7800 return 0;
7801
7802
7803 if (kthread_is_per_cpu(p))
7804 return 0;
7805
7806 if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
7807 int cpu;
7808
7809 schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
7810
7811 env->flags |= LBF_SOME_PINNED;
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823 if (env->idle == CPU_NEWLY_IDLE ||
7824 env->flags & (LBF_DST_PINNED | LBF_ACTIVE_LB))
7825 return 0;
7826
7827
7828 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
7829 if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
7830 env->flags |= LBF_DST_PINNED;
7831 env->new_dst_cpu = cpu;
7832 break;
7833 }
7834 }
7835
7836 return 0;
7837 }
7838
7839
7840 env->flags &= ~LBF_ALL_PINNED;
7841
7842 if (task_running(env->src_rq, p)) {
7843 schedstat_inc(p->se.statistics.nr_failed_migrations_running);
7844 return 0;
7845 }
7846
7847
7848
7849
7850
7851
7852
7853
7854 if (env->flags & LBF_ACTIVE_LB)
7855 return 1;
7856
7857 tsk_cache_hot = migrate_degrades_locality(p, env);
7858 if (tsk_cache_hot == -1)
7859 tsk_cache_hot = task_hot(p, env);
7860
7861 if (tsk_cache_hot <= 0 ||
7862 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
7863 if (tsk_cache_hot == 1) {
7864 schedstat_inc(env->sd->lb_hot_gained[env->idle]);
7865 schedstat_inc(p->se.statistics.nr_forced_migrations);
7866 }
7867 return 1;
7868 }
7869
7870 schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
7871 return 0;
7872}
7873
7874
7875
7876
7877static void detach_task(struct task_struct *p, struct lb_env *env)
7878{
7879 lockdep_assert_rq_held(env->src_rq);
7880
7881 deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
7882 set_task_cpu(p, env->dst_cpu);
7883}
7884
7885
7886
7887
7888
7889
7890
7891static struct task_struct *detach_one_task(struct lb_env *env)
7892{
7893 struct task_struct *p;
7894
7895 lockdep_assert_rq_held(env->src_rq);
7896
7897 list_for_each_entry_reverse(p,
7898 &env->src_rq->cfs_tasks, se.group_node) {
7899 if (!can_migrate_task(p, env))
7900 continue;
7901
7902 detach_task(p, env);
7903
7904
7905
7906
7907
7908
7909
7910 schedstat_inc(env->sd->lb_gained[env->idle]);
7911 return p;
7912 }
7913 return NULL;
7914}
7915
7916static const unsigned int sched_nr_migrate_break = 32;
7917
7918
7919
7920
7921
7922
7923
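/*
 * detach_tasks() - try to detach up to env->imbalance worth of load, util or
 * tasks from the busiest runqueue, as part of a balancing operation within
 * domain "sd". Returns the number of detached tasks, 0 otherwise.
 */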
7924static int detach_tasks(struct lb_env *env)
7925{
7926 struct list_head *tasks = &env->src_rq->cfs_tasks;
7927 unsigned long util, load;
7928 struct task_struct *p;
7929 int detached = 0;
7930
7931 lockdep_assert_rq_held(env->src_rq);
7932
7933
7934
7935
7936
7937 if (env->src_rq->nr_running <= 1) {
7938 env->flags &= ~LBF_ALL_PINNED;
7939 return 0;
7940 }
7941
7942 if (env->imbalance <= 0)
7943 return 0;
7944
7945 while (!list_empty(tasks)) {
7946
7947
7948
7949
7950 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
7951 break;
7952
7953 p = list_last_entry(tasks, struct task_struct, se.group_node);
7954
7955 env->loop++;
7956
7957 if (env->loop > env->loop_max)
7958 break;
7959
7960
7961 if (env->loop > env->loop_break) {
7962 env->loop_break += sched_nr_migrate_break;
7963 env->flags |= LBF_NEED_BREAK;
7964 break;
7965 }
7966
7967 if (!can_migrate_task(p, env))
7968 goto next;
7969
7970 switch (env->migration_type) {
7971 case migrate_load:
7972
7973
7974
7975
7976
7977
7978
7979 load = max_t(unsigned long, task_h_load(p), 1);
7980
7981 if (sched_feat(LB_MIN) &&
7982 load < 16 && !env->sd->nr_balance_failed)
7983 goto next;
7984
7985
7986
7987
7988
7989
7990
7991 if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance)
7992 goto next;
7993
7994 env->imbalance -= load;
7995 break;
7996
7997 case migrate_util:
7998 util = task_util_est(p);
7999
8000 if (util > env->imbalance)
8001 goto next;
8002
8003 env->imbalance -= util;
8004 break;
8005
8006 case migrate_task:
8007 env->imbalance--;
8008 break;
8009
8010 case migrate_misfit:
8011
8012 if (task_fits_capacity(p, capacity_of(env->src_cpu)))
8013 goto next;
8014
8015 env->imbalance = 0;
8016 break;
8017 }
8018
8019 detach_task(p, env);
8020 list_add(&p->se.group_node, &env->tasks);
8021
8022 detached++;
8023
8024#ifdef CONFIG_PREEMPTION
8025
8026
8027
8028
8029
8030 if (env->idle == CPU_NEWLY_IDLE)
8031 break;
8032#endif
8033
8034
8035
8036
8037
8038 if (env->imbalance <= 0)
8039 break;
8040
8041 continue;
8042next:
8043 list_move(&p->se.group_node, tasks);
8044 }
8045
8046
8047
8048
8049
8050
8051 schedstat_add(env->sd->lb_gained[env->idle], detached);
8052
8053 return detached;
8054}
8055
8056
8057
8058
8059static void attach_task(struct rq *rq, struct task_struct *p)
8060{
8061 lockdep_assert_rq_held(rq);
8062
8063 BUG_ON(task_rq(p) != rq);
8064 activate_task(rq, p, ENQUEUE_NOCLOCK);
8065 check_preempt_curr(rq, p, 0);
8066}
8067
8068
8069
8070
8071
8072static void attach_one_task(struct rq *rq, struct task_struct *p)
8073{
8074 struct rq_flags rf;
8075
8076 rq_lock(rq, &rf);
8077 update_rq_clock(rq);
8078 attach_task(rq, p);
8079 rq_unlock(rq, &rf);
8080}
8081
8082
8083
8084
8085
8086static void attach_tasks(struct lb_env *env)
8087{
8088 struct list_head *tasks = &env->tasks;
8089 struct task_struct *p;
8090 struct rq_flags rf;
8091
8092 rq_lock(env->dst_rq, &rf);
8093 update_rq_clock(env->dst_rq);
8094
8095 while (!list_empty(tasks)) {
8096 p = list_first_entry(tasks, struct task_struct, se.group_node);
8097 list_del_init(&p->se.group_node);
8098
8099 attach_task(env->dst_rq, p);
8100 }
8101
8102 rq_unlock(env->dst_rq, &rf);
8103}
8104
8105#ifdef CONFIG_NO_HZ_COMMON
8106static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
8107{
8108 if (cfs_rq->avg.load_avg)
8109 return true;
8110
8111 if (cfs_rq->avg.util_avg)
8112 return true;
8113
8114 return false;
8115}
8116
8117static inline bool others_have_blocked(struct rq *rq)
8118{
8119 if (READ_ONCE(rq->avg_rt.util_avg))
8120 return true;
8121
8122 if (READ_ONCE(rq->avg_dl.util_avg))
8123 return true;
8124
8125 if (thermal_load_avg(rq))
8126 return true;
8127
8128#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
8129 if (READ_ONCE(rq->avg_irq.util_avg))
8130 return true;
8131#endif
8132
8133 return false;
8134}
8135
8136static inline void update_blocked_load_tick(struct rq *rq)
8137{
8138 WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies);
8139}
8140
8141static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
8142{
8143 if (!has_blocked)
8144 rq->has_blocked_load = 0;
8145}
8146#else
8147static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
8148static inline bool others_have_blocked(struct rq *rq) { return false; }
8149static inline void update_blocked_load_tick(struct rq *rq) {}
8150static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
8151#endif
8152
8153static bool __update_blocked_others(struct rq *rq, bool *done)
8154{
8155 const struct sched_class *curr_class;
8156 u64 now = rq_clock_pelt(rq);
8157 unsigned long thermal_pressure;
8158 bool decayed;
8159
8160
8161
8162
8163
8164 curr_class = rq->curr->sched_class;
8165
8166 thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
8167
8168 decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
8169 update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
8170 update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) |
8171 update_irq_load_avg(rq, 0);
8172
8173 if (others_have_blocked(rq))
8174 *done = false;
8175
8176 return decayed;
8177}
8178
8179#ifdef CONFIG_FAIR_GROUP_SCHED
8180
8181static bool __update_blocked_fair(struct rq *rq, bool *done)
8182{
8183 struct cfs_rq *cfs_rq, *pos;
8184 bool decayed = false;
8185 int cpu = cpu_of(rq);
8186
8187
8188
8189
8190
8191 for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
8192 struct sched_entity *se;
8193
8194 if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
8195 update_tg_load_avg(cfs_rq);
8196
8197 if (cfs_rq == &rq->cfs)
8198 decayed = true;
8199 }
8200
8201
8202 se = cfs_rq->tg->se[cpu];
8203 if (se && !skip_blocked_update(se))
8204 update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
8205
8206
8207
8208
8209
8210 if (cfs_rq_is_decayed(cfs_rq))
8211 list_del_leaf_cfs_rq(cfs_rq);
8212
8213
8214 if (cfs_rq_has_blocked(cfs_rq))
8215 *done = false;
8216 }
8217
8218 return decayed;
8219}
8220
8221
8222
8223
8224
8225
8226static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
8227{
8228 struct rq *rq = rq_of(cfs_rq);
8229 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
8230 unsigned long now = jiffies;
8231 unsigned long load;
8232
8233 if (cfs_rq->last_h_load_update == now)
8234 return;
8235
8236 WRITE_ONCE(cfs_rq->h_load_next, NULL);
8237 for_each_sched_entity(se) {
8238 cfs_rq = cfs_rq_of(se);
8239 WRITE_ONCE(cfs_rq->h_load_next, se);
8240 if (cfs_rq->last_h_load_update == now)
8241 break;
8242 }
8243
8244 if (!se) {
8245 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
8246 cfs_rq->last_h_load_update = now;
8247 }
8248
8249 while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
8250 load = cfs_rq->h_load;
8251 load = div64_ul(load * se->avg.load_avg,
8252 cfs_rq_load_avg(cfs_rq) + 1);
8253 cfs_rq = group_cfs_rq(se);
8254 cfs_rq->h_load = load;
8255 cfs_rq->last_h_load_update = now;
8256 }
8257}
8258
8259static unsigned long task_h_load(struct task_struct *p)
8260{
8261 struct cfs_rq *cfs_rq = task_cfs_rq(p);
8262
8263 update_cfs_rq_h_load(cfs_rq);
8264 return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
8265 cfs_rq_load_avg(cfs_rq) + 1);
8266}
8267#else
8268static bool __update_blocked_fair(struct rq *rq, bool *done)
8269{
8270 struct cfs_rq *cfs_rq = &rq->cfs;
8271 bool decayed;
8272
8273 decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
8274 if (cfs_rq_has_blocked(cfs_rq))
8275 *done = false;
8276
8277 return decayed;
8278}
8279
8280static unsigned long task_h_load(struct task_struct *p)
8281{
8282 return p->se.avg.load_avg;
8283}
8284#endif
8285
8286static void update_blocked_averages(int cpu)
8287{
8288 bool decayed = false, done = true;
8289 struct rq *rq = cpu_rq(cpu);
8290 struct rq_flags rf;
8291
8292 rq_lock_irqsave(rq, &rf);
8293 update_blocked_load_tick(rq);
8294 update_rq_clock(rq);
8295
8296 decayed |= __update_blocked_others(rq, &done);
8297 decayed |= __update_blocked_fair(rq, &done);
8298
8299 update_blocked_load_status(rq, !done);
8300 if (decayed)
8301 cpufreq_update_util(rq, 0);
8302 rq_unlock_irqrestore(rq, &rf);
8303}
8304
8305
8306
8307
8308
8309
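/*
 * sg_lb_stats - statistics of a sched_group gathered during load balancing.
 */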
8310struct sg_lb_stats {
8311 unsigned long avg_load;
8312 unsigned long group_load;
8313 unsigned long group_capacity;
8314 unsigned long group_util;
8315 unsigned long group_runnable;
8316 unsigned int sum_nr_running;
8317 unsigned int sum_h_nr_running;
8318 unsigned int idle_cpus;
8319 unsigned int group_weight;
8320 enum group_type group_type;
8321 unsigned int group_asym_packing;
8322 unsigned long group_misfit_task_load;
8323#ifdef CONFIG_NUMA_BALANCING
8324 unsigned int nr_numa_running;
8325 unsigned int nr_preferred_running;
8326#endif
8327};
8328
8329
8330
8331
8332
8333struct sd_lb_stats {
8334 struct sched_group *busiest;
8335 struct sched_group *local;
8336 unsigned long total_load;
8337 unsigned long total_capacity;
8338 unsigned long avg_load;
8339 unsigned int prefer_sibling;
8340
8341 struct sg_lb_stats busiest_stat;
8342 struct sg_lb_stats local_stat;
8343};
8344
8345static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
8346{
8347
8348
8349
8350
8351
8352
8353
8354 *sds = (struct sd_lb_stats){
8355 .busiest = NULL,
8356 .local = NULL,
8357 .total_load = 0UL,
8358 .total_capacity = 0UL,
8359 .busiest_stat = {
8360 .idle_cpus = UINT_MAX,
8361 .group_type = group_has_spare,
8362 },
8363 };
8364}
8365
8366static unsigned long scale_rt_capacity(int cpu)
8367{
8368 struct rq *rq = cpu_rq(cpu);
8369 unsigned long max = arch_scale_cpu_capacity(cpu);
8370 unsigned long used, free;
8371 unsigned long irq;
8372
8373 irq = cpu_util_irq(rq);
8374
8375 if (unlikely(irq >= max))
8376 return 1;
8377
8378
8379
8380
8381
8382
8383
8384 used = READ_ONCE(rq->avg_rt.util_avg);
8385 used += READ_ONCE(rq->avg_dl.util_avg);
8386 used += thermal_load_avg(rq);
8387
8388 if (unlikely(used >= max))
8389 return 1;
8390
8391 free = max - used;
8392
8393 return scale_irq_capacity(free, irq, max);
8394}
8395
8396static void update_cpu_capacity(struct sched_domain *sd, int cpu)
8397{
8398 unsigned long capacity = scale_rt_capacity(cpu);
8399 struct sched_group *sdg = sd->groups;
8400
8401 cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
8402
8403 if (!capacity)
8404 capacity = 1;
8405
8406 cpu_rq(cpu)->cpu_capacity = capacity;
8407 trace_sched_cpu_capacity_tp(cpu_rq(cpu));
8408
8409 sdg->sgc->capacity = capacity;
8410 sdg->sgc->min_capacity = capacity;
8411 sdg->sgc->max_capacity = capacity;
8412}
8413
8414void update_group_capacity(struct sched_domain *sd, int cpu)
8415{
8416 struct sched_domain *child = sd->child;
8417 struct sched_group *group, *sdg = sd->groups;
8418 unsigned long capacity, min_capacity, max_capacity;
8419 unsigned long interval;
8420
8421 interval = msecs_to_jiffies(sd->balance_interval);
8422 interval = clamp(interval, 1UL, max_load_balance_interval);
8423 sdg->sgc->next_update = jiffies + interval;
8424
8425 if (!child) {
8426 update_cpu_capacity(sd, cpu);
8427 return;
8428 }
8429
8430 capacity = 0;
8431 min_capacity = ULONG_MAX;
8432 max_capacity = 0;
8433
8434 if (child->flags & SD_OVERLAP) {
8435
8436
8437
8438
8439
8440 for_each_cpu(cpu, sched_group_span(sdg)) {
8441 unsigned long cpu_cap = capacity_of(cpu);
8442
8443 capacity += cpu_cap;
8444 min_capacity = min(cpu_cap, min_capacity);
8445 max_capacity = max(cpu_cap, max_capacity);
8446 }
8447 } else {
8448
8449
8450
8451
8452
8453 group = child->groups;
8454 do {
8455 struct sched_group_capacity *sgc = group->sgc;
8456
8457 capacity += sgc->capacity;
8458 min_capacity = min(sgc->min_capacity, min_capacity);
8459 max_capacity = max(sgc->max_capacity, max_capacity);
8460 group = group->next;
8461 } while (group != child->groups);
8462 }
8463
8464 sdg->sgc->capacity = capacity;
8465 sdg->sgc->min_capacity = min_capacity;
8466 sdg->sgc->max_capacity = max_capacity;
8467}
8468
8469
8470
8471
8472
8473
8474static inline int
8475check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
8476{
8477 return ((rq->cpu_capacity * sd->imbalance_pct) <
8478 (rq->cpu_capacity_orig * 100));
8479}
8480
8481
8482
8483
8484
8485
8486static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
8487{
8488 return rq->misfit_task_load &&
8489 (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
8490 check_cpu_capacity(rq, sd));
8491}
8492
8493
8494
8495
8496
8497
8498
8499
8500
8501
8502
8503
8504
8505
8506
8507
8508
8509
8510
8511
8512
8513
8514
8515
8516
8517
8518
8519
8520
8521
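/*
 * Group imbalance: a group is flagged imbalanced when affinity constraints
 * prevented a previous balance attempt from moving load where it should have
 * gone, so later passes are allowed to be more aggressive with that group.
 */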
8522static inline int sg_imbalanced(struct sched_group *group)
8523{
8524 return group->sgc->imbalance;
8525}
8526
8527
8528
8529
8530
8531
8532
8533
8534
8535
8536
8537
8538
8539static inline bool
8540group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
8541{
8542 if (sgs->sum_nr_running < sgs->group_weight)
8543 return true;
8544
8545 if ((sgs->group_capacity * imbalance_pct) <
8546 (sgs->group_runnable * 100))
8547 return false;
8548
8549 if ((sgs->group_capacity * 100) >
8550 (sgs->group_util * imbalance_pct))
8551 return true;
8552
8553 return false;
8554}
8555
8556
8557
8558
8559
8560
8561
8562
8563
8564static inline bool
8565group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
8566{
8567 if (sgs->sum_nr_running <= sgs->group_weight)
8568 return false;
8569
8570 if ((sgs->group_capacity * 100) <
8571 (sgs->group_util * imbalance_pct))
8572 return true;
8573
8574 if ((sgs->group_capacity * imbalance_pct) <
8575 (sgs->group_runnable * 100))
8576 return true;
8577
8578 return false;
8579}
8580
8581static inline enum
8582group_type group_classify(unsigned int imbalance_pct,
8583 struct sched_group *group,
8584 struct sg_lb_stats *sgs)
8585{
8586 if (group_is_overloaded(imbalance_pct, sgs))
8587 return group_overloaded;
8588
8589 if (sg_imbalanced(group))
8590 return group_imbalanced;
8591
8592 if (sgs->group_asym_packing)
8593 return group_asym_packing;
8594
8595 if (sgs->group_misfit_task_load)
8596 return group_misfit_task;
8597
8598 if (!group_has_capacity(imbalance_pct, sgs))
8599 return group_fully_busy;
8600
8601 return group_has_spare;
8602}
8603
8604
8605
8606
8607
8608
8609
8610
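/*
 * update_sg_lb_stats - update a sched_group's statistics for load balancing.
 * @env: the load balancing environment.
 * @group: sched_group whose statistics are to be updated.
 * @sgs: variable to hold the statistics for this group.
 * @sg_status: flags indicating the status of the sched_group.
 */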
8611static inline void update_sg_lb_stats(struct lb_env *env,
8612 struct sched_group *group,
8613 struct sg_lb_stats *sgs,
8614 int *sg_status)
8615{
8616 int i, nr_running, local_group;
8617
8618 memset(sgs, 0, sizeof(*sgs));
8619
8620 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
8621
8622 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
8623 struct rq *rq = cpu_rq(i);
8624
8625 sgs->group_load += cpu_load(rq);
8626 sgs->group_util += cpu_util(i);
8627 sgs->group_runnable += cpu_runnable(rq);
8628 sgs->sum_h_nr_running += rq->cfs.h_nr_running;
8629
8630 nr_running = rq->nr_running;
8631 sgs->sum_nr_running += nr_running;
8632
8633 if (nr_running > 1)
8634 *sg_status |= SG_OVERLOAD;
8635
8636 if (cpu_overutilized(i))
8637 *sg_status |= SG_OVERUTILIZED;
8638
8639#ifdef CONFIG_NUMA_BALANCING
8640 sgs->nr_numa_running += rq->nr_numa_running;
8641 sgs->nr_preferred_running += rq->nr_preferred_running;
8642#endif
8643
8644
8645
8646 if (!nr_running && idle_cpu(i)) {
8647 sgs->idle_cpus++;
8648
8649 continue;
8650 }
8651
8652 if (local_group)
8653 continue;
8654
8655
8656 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
8657 sgs->group_misfit_task_load < rq->misfit_task_load) {
8658 sgs->group_misfit_task_load = rq->misfit_task_load;
8659 *sg_status |= SG_OVERLOAD;
8660 }
8661 }
8662
8663
8664 if (env->sd->flags & SD_ASYM_PACKING &&
8665 env->idle != CPU_NOT_IDLE &&
8666 sgs->sum_h_nr_running &&
8667 sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {
8668 sgs->group_asym_packing = 1;
8669 }
8670
8671 sgs->group_capacity = group->sgc->capacity;
8672
8673 sgs->group_weight = group->group_weight;
8674
8675 sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
8676
8677
8678 if (sgs->group_type == group_overloaded)
8679 sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
8680 sgs->group_capacity;
8681}
8682
8683
8684
8685
8686
8687
8688
8689
8690
8691
8692
8693
8694
8695
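/**
 * update_sd_pick_busiest - return 1 on busiest group
 * @env: The load balancing environment.
 * @sds: sched_domain statistics
 * @sg: sched_group candidate to be checked for being the busiest
 * @sgs: sched_group statistics
 *
 * Determine if @sg is a busier group than the previously selected
 * busiest group.
 *
 * Return: %true if @sg is a busier group than the previously selected
 * busiest group, %false otherwise.
 */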
8696static bool update_sd_pick_busiest(struct lb_env *env,
8697 struct sd_lb_stats *sds,
8698 struct sched_group *sg,
8699 struct sg_lb_stats *sgs)
8700{
8701 struct sg_lb_stats *busiest = &sds->busiest_stat;
8702
8703
8704 if (!sgs->sum_h_nr_running)
8705 return false;
8706
8707
8708
8709
8710
8711
8712
8713 if (sgs->group_type == group_misfit_task &&
8714 (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
8715 sds->local_stat.group_type != group_has_spare))
8716 return false;
8717
8718 if (sgs->group_type > busiest->group_type)
8719 return true;
8720
8721 if (sgs->group_type < busiest->group_type)
8722 return false;
8723
8724
8725
8726
8727
8728
8729 switch (sgs->group_type) {
8730 case group_overloaded:
8731
8732 if (sgs->avg_load <= busiest->avg_load)
8733 return false;
8734 break;
8735
8736 case group_imbalanced:
8737
8738
8739
8740
8741 return false;
8742
8743 case group_asym_packing:
8744
8745 if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
8746 return false;
8747 break;
8748
8749 case group_misfit_task:
8750
8751
8752
8753
8754 if (sgs->group_misfit_task_load < busiest->group_misfit_task_load)
8755 return false;
8756 break;
8757
8758 case group_fully_busy:
8759
8760
8761
8762
8763
8764
8765
8766
8767
8768
8769 if (sgs->avg_load <= busiest->avg_load)
8770 return false;
8771 break;
8772
8773 case group_has_spare:
8774
8775
8776
8777
8778
8779
8780
8781 if (sgs->idle_cpus > busiest->idle_cpus)
8782 return false;
8783 else if ((sgs->idle_cpus == busiest->idle_cpus) &&
8784 (sgs->sum_nr_running <= busiest->sum_nr_running))
8785 return false;
8786
8787 break;
8788 }
8789
8790
8791
8792
8793
8794
8795
8796 if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
8797 (sgs->group_type <= group_fully_busy) &&
8798 (capacity_greater(sg->sgc->min_capacity, capacity_of(env->dst_cpu))))
8799 return false;
8800
8801 return true;
8802}
8803
8804#ifdef CONFIG_NUMA_BALANCING
8805static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
8806{
8807 if (sgs->sum_h_nr_running > sgs->nr_numa_running)
8808 return regular;
8809 if (sgs->sum_h_nr_running > sgs->nr_preferred_running)
8810 return remote;
8811 return all;
8812}
8813
8814static inline enum fbq_type fbq_classify_rq(struct rq *rq)
8815{
8816 if (rq->nr_running > rq->nr_numa_running)
8817 return regular;
8818 if (rq->nr_running > rq->nr_preferred_running)
8819 return remote;
8820 return all;
8821}
8822#else
8823static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
8824{
8825 return all;
8826}
8827
8828static inline enum fbq_type fbq_classify_rq(struct rq *rq)
8829{
8830 return regular;
8831}
8832#endif
8833
8834
8835struct sg_lb_stats;
8836
8837
8838
8839
8840
8841static unsigned int task_running_on_cpu(int cpu, struct task_struct *p)
8842{
8843
8844 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
8845 return 0;
8846
8847 if (task_on_rq_queued(p))
8848 return 1;
8849
8850 return 0;
8851}
8852
8853
8854
8855
8856
8857
8858
8859
8860static int idle_cpu_without(int cpu, struct task_struct *p)
8861{
8862 struct rq *rq = cpu_rq(cpu);
8863
8864 if (rq->curr != rq->idle && rq->curr != p)
8865 return 0;
8866
8867
8868
8869
8870
8871
8872
8873#ifdef CONFIG_SMP
8874 if (rq->ttwu_pending)
8875 return 0;
8876#endif
8877
8878 return 1;
8879}
8880
8881
8882
8883
8884
8885
8886
8887
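/*
 * update_sg_wakeup_stats - Update sched_group's statistics for wakeup.
 * @sd: The sched_domain level to look for idlest group.
 * @group: sched_group whose statistics are to be updated.
 * @sgs: variable to hold the statistics for this group.
 * @p: The task for which we look for the idlest group/CPU.
 */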
8888static inline void update_sg_wakeup_stats(struct sched_domain *sd,
8889 struct sched_group *group,
8890 struct sg_lb_stats *sgs,
8891 struct task_struct *p)
8892{
8893 int i, nr_running;
8894
8895 memset(sgs, 0, sizeof(*sgs));
8896
8897 for_each_cpu(i, sched_group_span(group)) {
8898 struct rq *rq = cpu_rq(i);
8899 unsigned int local;
8900
8901 sgs->group_load += cpu_load_without(rq, p);
8902 sgs->group_util += cpu_util_without(i, p);
8903 sgs->group_runnable += cpu_runnable_without(rq, p);
8904 local = task_running_on_cpu(i, p);
8905 sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
8906
8907 nr_running = rq->nr_running - local;
8908 sgs->sum_nr_running += nr_running;
8909
8910
8911
8912
8913 if (!nr_running && idle_cpu_without(i, p))
8914 sgs->idle_cpus++;
8915
8916 }
8917
8918
8919 if (sd->flags & SD_ASYM_CPUCAPACITY &&
8920 !task_fits_capacity(p, group->sgc->max_capacity)) {
8921 sgs->group_misfit_task_load = 1;
8922 }
8923
8924 sgs->group_capacity = group->sgc->capacity;
8925
8926 sgs->group_weight = group->group_weight;
8927
8928 sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);
8929
8930
8931
8932
8933
8934 if (sgs->group_type == group_fully_busy ||
8935 sgs->group_type == group_overloaded)
8936 sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
8937 sgs->group_capacity;
8938}
8939
8940static bool update_pick_idlest(struct sched_group *idlest,
8941 struct sg_lb_stats *idlest_sgs,
8942 struct sched_group *group,
8943 struct sg_lb_stats *sgs)
8944{
8945 if (sgs->group_type < idlest_sgs->group_type)
8946 return true;
8947
8948 if (sgs->group_type > idlest_sgs->group_type)
8949 return false;
8950
8951
8952
8953
8954
8955
8956 switch (sgs->group_type) {
8957 case group_overloaded:
8958 case group_fully_busy:
8959
8960 if (idlest_sgs->avg_load <= sgs->avg_load)
8961 return false;
8962 break;
8963
8964 case group_imbalanced:
8965 case group_asym_packing:
8966
8967 return false;
8968
8969 case group_misfit_task:
8970
8971 if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
8972 return false;
8973 break;
8974
8975 case group_has_spare:
8976
8977 if (idlest_sgs->idle_cpus > sgs->idle_cpus)
8978 return false;
8979
8980
8981 if (idlest_sgs->idle_cpus == sgs->idle_cpus &&
8982 idlest_sgs->group_util <= sgs->group_util)
8983 return false;
8984
8985 break;
8986 }
8987
8988 return true;
8989}
8990
8991
8992
8993
8994
8995
8996static inline bool allow_numa_imbalance(int dst_running, int dst_weight)
8997{
8998 return (dst_running < (dst_weight >> 2));
8999}
9000
9001
9002
9003
9004
9005
9006
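/*
 * find_idlest_group() finds and returns the least busy CPU group within the
 * domain.
 *
 * Assumes p is allowed on at least one CPU in sd.
 */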
9007static struct sched_group *
9008find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
9009{
9010 struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
9011 struct sg_lb_stats local_sgs, tmp_sgs;
9012 struct sg_lb_stats *sgs;
9013 unsigned long imbalance;
9014 struct sg_lb_stats idlest_sgs = {
9015 .avg_load = UINT_MAX,
9016 .group_type = group_overloaded,
9017 };
9018
9019 do {
9020 int local_group;
9021
9022
9023 if (!cpumask_intersects(sched_group_span(group),
9024 p->cpus_ptr))
9025 continue;
9026
9027
9028 if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group))
9029 continue;
9030
9031 local_group = cpumask_test_cpu(this_cpu,
9032 sched_group_span(group));
9033
9034 if (local_group) {
9035 sgs = &local_sgs;
9036 local = group;
9037 } else {
9038 sgs = &tmp_sgs;
9039 }
9040
9041 update_sg_wakeup_stats(sd, group, sgs, p);
9042
9043 if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) {
9044 idlest = group;
9045 idlest_sgs = *sgs;
9046 }
9047
9048 } while (group = group->next, group != sd->groups);
9049
9050
9051
9052 if (!idlest)
9053 return NULL;
9054
9055
9056 if (!local)
9057 return idlest;
9058
9059
9060
9061
9062
9063 if (local_sgs.group_type < idlest_sgs.group_type)
9064 return NULL;
9065
9066
9067
9068
9069
9070 if (local_sgs.group_type > idlest_sgs.group_type)
9071 return idlest;
9072
9073 switch (local_sgs.group_type) {
9074 case group_overloaded:
9075 case group_fully_busy:
9076
9077
9078 imbalance = scale_load_down(NICE_0_LOAD) *
9079 (sd->imbalance_pct-100) / 100;
9080
9081
9082
9083
9084
9085
9086
9087
9088
9089
9090 if ((sd->flags & SD_NUMA) &&
9091 ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load))
9092 return NULL;
9093
9094
9095
9096
9097
9098 if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance))
9099 return NULL;
9100
9101 if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load)
9102 return NULL;
9103 break;
9104
9105 case group_imbalanced:
9106 case group_asym_packing:
9107
9108 return NULL;
9109
9110 case group_misfit_task:
9111
9112 if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
9113 return NULL;
9114 break;
9115
9116 case group_has_spare:
9117 if (sd->flags & SD_NUMA) {
9118#ifdef CONFIG_NUMA_BALANCING
9119 int idlest_cpu;
9120
9121
9122
9123
9124 if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
9125 return NULL;
9126
9127 idlest_cpu = cpumask_first(sched_group_span(idlest));
9128 if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
9129 return idlest;
9130#endif
9131
9132
9133
9134
9135
9136
9137 if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight))
9138 return NULL;
9139 }
9140
9141
9142
9143
9144
9145
9146
9147 if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus)
9148 return NULL;
9149 break;
9150 }
9151
9152 return idlest;
9153}
9154
9155
9156
9157
9158
9159
9160
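/**
 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
 * @env: The load balancing environment.
 * @sds: variable to hold the statistics for this sched_domain.
 */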
9161static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
9162{
9163 struct sched_domain *child = env->sd->child;
9164 struct sched_group *sg = env->sd->groups;
9165 struct sg_lb_stats *local = &sds->local_stat;
9166 struct sg_lb_stats tmp_sgs;
9167 int sg_status = 0;
9168
9169 do {
9170 struct sg_lb_stats *sgs = &tmp_sgs;
9171 int local_group;
9172
9173 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
9174 if (local_group) {
9175 sds->local = sg;
9176 sgs = local;
9177
9178 if (env->idle != CPU_NEWLY_IDLE ||
9179 time_after_eq(jiffies, sg->sgc->next_update))
9180 update_group_capacity(env->sd, env->dst_cpu);
9181 }
9182
9183 update_sg_lb_stats(env, sg, sgs, &sg_status);
9184
9185 if (local_group)
9186 goto next_group;
9187
9188
9189 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
9190 sds->busiest = sg;
9191 sds->busiest_stat = *sgs;
9192 }
9193
9194next_group:
9195
9196 sds->total_load += sgs->group_load;
9197 sds->total_capacity += sgs->group_capacity;
9198
9199 sg = sg->next;
9200 } while (sg != env->sd->groups);
9201
9202
9203 sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
9204
9205
9206 if (env->sd->flags & SD_NUMA)
9207 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
9208
9209 if (!env->sd->parent) {
9210 struct root_domain *rd = env->dst_rq->rd;
9211
9212
9213 WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
9214
9215
9216 WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
9217 trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
9218 } else if (sg_status & SG_OVERUTILIZED) {
9219 struct root_domain *rd = env->dst_rq->rd;
9220
9221 WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
9222 trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
9223 }
9224}
9225
9226#define NUMA_IMBALANCE_MIN 2
9227
9228static inline long adjust_numa_imbalance(int imbalance,
9229 int dst_running, int dst_weight)
9230{
9231 if (!allow_numa_imbalance(dst_running, dst_weight))
9232 return imbalance;
9233
9234
9235
9236
9237
9238 if (imbalance <= NUMA_IMBALANCE_MIN)
9239 return 0;
9240
9241 return imbalance;
9242}
9243
9244
9245
9246
9247
9248
9249
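/**
 * calculate_imbalance - Calculate the amount of imbalance present within the
 *			 groups of a given sched_domain during load balance.
 * @env: load balance environment
 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
 */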
9250static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
9251{
9252 struct sg_lb_stats *local, *busiest;
9253
9254 local = &sds->local_stat;
9255 busiest = &sds->busiest_stat;
9256
9257 if (busiest->group_type == group_misfit_task) {
9258
9259 env->migration_type = migrate_misfit;
9260 env->imbalance = 1;
9261 return;
9262 }
9263
9264 if (busiest->group_type == group_asym_packing) {
9265
9266
9267
9268
9269 env->migration_type = migrate_task;
9270 env->imbalance = busiest->sum_h_nr_running;
9271 return;
9272 }
9273
9274 if (busiest->group_type == group_imbalanced) {
9275
9276
9277
9278
9279
9280
9281 env->migration_type = migrate_task;
9282 env->imbalance = 1;
9283 return;
9284 }
9285
9286
9287
9288
9289
9290 if (local->group_type == group_has_spare) {
9291 if ((busiest->group_type > group_fully_busy) &&
9292 !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) {
9293
9294
9295
9296
9297
9298
9299
9300
9301 env->migration_type = migrate_util;
9302 env->imbalance = max(local->group_capacity, local->group_util) -
9303 local->group_util;
9304
9305
9306
9307
9308
9309
9310
9311
9312 if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) {
9313 env->migration_type = migrate_task;
9314 env->imbalance = 1;
9315 }
9316
9317 return;
9318 }
9319
9320 if (busiest->group_weight == 1 || sds->prefer_sibling) {
9321 unsigned int nr_diff = busiest->sum_nr_running;
9322
9323
9324
9325
9326 env->migration_type = migrate_task;
9327 lsub_positive(&nr_diff, local->sum_nr_running);
9328 env->imbalance = nr_diff >> 1;
9329 } else {
9330
9331
9332
9333
9334
9335 env->migration_type = migrate_task;
9336 env->imbalance = max_t(long, 0, (local->idle_cpus -
9337 busiest->idle_cpus) >> 1);
9338 }
9339
9340
9341 if (env->sd->flags & SD_NUMA) {
9342 env->imbalance = adjust_numa_imbalance(env->imbalance,
9343 busiest->sum_nr_running, busiest->group_weight);
9344 }
9345
9346 return;
9347 }
9348
9349
9350
9351
9352
9353 if (local->group_type < group_overloaded) {
9354
9355
9356
9357
9358
9359 local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
9360 local->group_capacity;
9361
9362 sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
9363 sds->total_capacity;
9364
9365
9366
9367
9368 if (local->avg_load >= busiest->avg_load) {
9369 env->imbalance = 0;
9370 return;
9371 }
9372 }
9373
9374
9375
9376
9377
9378
9379
9380
9381
9382 env->migration_type = migrate_load;
9383 env->imbalance = min(
9384 (busiest->avg_load - sds->avg_load) * busiest->group_capacity,
9385 (sds->avg_load - local->avg_load) * local->group_capacity
9386 ) / SCHED_CAPACITY_SCALE;
9387}
9388
9389
9390
9391
9392
9393
9394
9395
9396
9397
9398
9399
9400
9401
9402
9403
9404
9405
9406
9407
9408
9409
9410
9411
9412
9413
9414
9415
9416
9417
9418
9419
9420
9421
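/**
 * find_busiest_group - Returns the busiest group within the sched_domain
 * if there is an imbalance.
 * @env: The load balancing environment.
 *
 * Also calculates the amount of runnable load which should be moved to
 * restore balance.
 *
 * Return: the busiest group if an imbalance exists, NULL otherwise.
 */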
9422static struct sched_group *find_busiest_group(struct lb_env *env)
9423{
9424 struct sg_lb_stats *local, *busiest;
9425 struct sd_lb_stats sds;
9426
9427 init_sd_lb_stats(&sds);
9428
9429
9430
9431
9432
9433 update_sd_lb_stats(env, &sds);
9434
9435 if (sched_energy_enabled()) {
9436 struct root_domain *rd = env->dst_rq->rd;
9437
9438 if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
9439 goto out_balanced;
9440 }
9441
9442 local = &sds.local_stat;
9443 busiest = &sds.busiest_stat;
9444
9445
9446 if (!sds.busiest)
9447 goto out_balanced;
9448
9449
9450 if (busiest->group_type == group_misfit_task)
9451 goto force_balance;
9452
9453
9454 if (busiest->group_type == group_asym_packing)
9455 goto force_balance;
9456
9457
9458
9459
9460
9461
9462 if (busiest->group_type == group_imbalanced)
9463 goto force_balance;
9464
9465
9466
9467
9468
9469 if (local->group_type > busiest->group_type)
9470 goto out_balanced;
9471
9472
9473
9474
9475
9476 if (local->group_type == group_overloaded) {
9477
9478
9479
9480
9481 if (local->avg_load >= busiest->avg_load)
9482 goto out_balanced;
9483
9484
9485 sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
9486 sds.total_capacity;
9487
9488
9489
9490
9491
9492 if (local->avg_load >= sds.avg_load)
9493 goto out_balanced;
9494
9495
9496
9497
9498
9499 if (100 * busiest->avg_load <=
9500 env->sd->imbalance_pct * local->avg_load)
9501 goto out_balanced;
9502 }
9503
9504
9505 if (sds.prefer_sibling && local->group_type == group_has_spare &&
9506 busiest->sum_nr_running > local->sum_nr_running + 1)
9507 goto force_balance;
9508
9509 if (busiest->group_type != group_overloaded) {
9510 if (env->idle == CPU_NOT_IDLE)
9511
9512
9513
9514
9515
9516 goto out_balanced;
9517
9518 if (busiest->group_weight > 1 &&
9519 local->idle_cpus <= (busiest->idle_cpus + 1))
9520
9521
9522
9523
9524
9525
9526
9527
9528
9529 goto out_balanced;
9530
9531 if (busiest->sum_h_nr_running == 1)
9532
9533
9534
9535 goto out_balanced;
9536 }
9537
9538force_balance:
9539
9540 calculate_imbalance(env, &sds);
9541 return env->imbalance ? sds.busiest : NULL;
9542
9543out_balanced:
9544 env->imbalance = 0;
9545 return NULL;
9546}
9547
9548
9549
9550
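/*
 * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
 */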
9551static struct rq *find_busiest_queue(struct lb_env *env,
9552 struct sched_group *group)
9553{
9554 struct rq *busiest = NULL, *rq;
9555 unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
9556 unsigned int busiest_nr = 0;
9557 int i;
9558
9559 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
9560 unsigned long capacity, load, util;
9561 unsigned int nr_running;
9562 enum fbq_type rt;
9563
9564 rq = cpu_rq(i);
9565 rt = fbq_classify_rq(rq);
9566
9567
9568
9569
9570
9571
9572
9573
9574
9575
9576
9577
9578
9579
9580
9581
9582
9583
9584
9585
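		/*
		 * Runqueues are classified as regular (no NUMA tasks),
		 * remote (NUMA tasks running away from their preferred
		 * node) or all. Skip rqs whose class exceeds what this
		 * balance pass will consider (env->fbq_type), to avoid
		 * migrating well-placed NUMA tasks when better
		 * candidates exist.
		 */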
9586 if (rt > env->fbq_type)
9587 continue;
9588
9589 nr_running = rq->cfs.h_nr_running;
9590 if (!nr_running)
9591 continue;
9592
9593 capacity = capacity_of(i);
9594
9595
9596
9597
9598
9599
9600
9601 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
9602 !capacity_greater(capacity_of(env->dst_cpu), capacity) &&
9603 nr_running == 1)
9604 continue;
9605
9606 switch (env->migration_type) {
9607 case migrate_load:
9608
9609
9610
9611
9612 load = cpu_load(rq);
9613
9614 if (nr_running == 1 && load > env->imbalance &&
9615 !check_cpu_capacity(rq, env->sd))
9616 break;
9617
9618
9619
9620
9621
9622
9623
9624
9625
9626
9627
9628
9629
9630
9631 if (load * busiest_capacity > busiest_load * capacity) {
9632 busiest_load = load;
9633 busiest_capacity = capacity;
9634 busiest = rq;
9635 }
9636 break;
9637
9638 case migrate_util:
9639 util = cpu_util(cpu_of(rq));
9640
9641
9642
9643
9644
9645
9646 if (nr_running <= 1)
9647 continue;
9648
9649 if (busiest_util < util) {
9650 busiest_util = util;
9651 busiest = rq;
9652 }
9653 break;
9654
9655 case migrate_task:
9656 if (busiest_nr < nr_running) {
9657 busiest_nr = nr_running;
9658 busiest = rq;
9659 }
9660 break;
9661
9662 case migrate_misfit:
9663
9664
9665
9666
9667 if (rq->misfit_task_load > busiest_load) {
9668 busiest_load = rq->misfit_task_load;
9669 busiest = rq;
9670 }
9671
9672 break;
9673
9674 }
9675 }
9676
9677 return busiest;
9678}
9679
9680
9681
9682
9683
9684#define MAX_PINNED_INTERVAL 512
9685
9686static inline bool
9687asym_active_balance(struct lb_env *env)
9688{
9689
9690
9691
9692
9693
9694 return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
9695 sched_asym_prefer(env->dst_cpu, env->src_cpu);
9696}
9697
9698static inline bool
9699imbalanced_active_balance(struct lb_env *env)
9700{
9701 struct sched_domain *sd = env->sd;
9702
9703
9704
9705
9706
9707
9708 if ((env->migration_type == migrate_task) &&
9709 (sd->nr_balance_failed > sd->cache_nice_tries+2))
9710 return 1;
9711
9712 return 0;
9713}
9714
9715static int need_active_balance(struct lb_env *env)
9716{
9717 struct sched_domain *sd = env->sd;
9718
9719 if (asym_active_balance(env))
9720 return 1;
9721
9722 if (imbalanced_active_balance(env))
9723 return 1;
9724
9725
9726
9727
9728
9729
9730
9731 if ((env->idle != CPU_NOT_IDLE) &&
9732 (env->src_rq->cfs.h_nr_running == 1)) {
9733 if ((check_cpu_capacity(env->src_rq, sd)) &&
9734 (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
9735 return 1;
9736 }
9737
9738 if (env->migration_type == migrate_misfit)
9739 return 1;
9740
9741 return 0;
9742}
9743
9744static int active_load_balance_cpu_stop(void *data);
9745
9746static int should_we_balance(struct lb_env *env)
9747{
9748 struct sched_group *sg = env->sd->groups;
9749 int cpu;
9750
9751
9752
9753
9754
9755 if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
9756 return 0;
9757
9758
9759
9760
9761
9762 if (env->idle == CPU_NEWLY_IDLE)
9763 return 1;
9764
9765
9766 for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
9767 if (!idle_cpu(cpu))
9768 continue;
9769
9770
9771 return cpu == env->dst_cpu;
9772 }
9773
9774
9775 return group_balance_cpu(sg) == env->dst_cpu;
9776}
9777
9778
9779
9780
9781
9782static int load_balance(int this_cpu, struct rq *this_rq,
9783 struct sched_domain *sd, enum cpu_idle_type idle,
9784 int *continue_balancing)
9785{
9786 int ld_moved, cur_ld_moved, active_balance = 0;
9787 struct sched_domain *sd_parent = sd->parent;
9788 struct sched_group *group;
9789 struct rq *busiest;
9790 struct rq_flags rf;
9791 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
9792
9793 struct lb_env env = {
9794 .sd = sd,
9795 .dst_cpu = this_cpu,
9796 .dst_rq = this_rq,
9797 .dst_grpmask = sched_group_span(sd->groups),
9798 .idle = idle,
9799 .loop_break = sched_nr_migrate_break,
9800 .cpus = cpus,
9801 .fbq_type = all,
9802 .tasks = LIST_HEAD_INIT(env.tasks),
9803 };
9804
9805 cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
9806
9807 schedstat_inc(sd->lb_count[idle]);
9808
9809redo:
9810 if (!should_we_balance(&env)) {
9811 *continue_balancing = 0;
9812 goto out_balanced;
9813 }
9814
9815 group = find_busiest_group(&env);
9816 if (!group) {
9817 schedstat_inc(sd->lb_nobusyg[idle]);
9818 goto out_balanced;
9819 }
9820
9821 busiest = find_busiest_queue(&env, group);
9822 if (!busiest) {
9823 schedstat_inc(sd->lb_nobusyq[idle]);
9824 goto out_balanced;
9825 }
9826
9827 BUG_ON(busiest == env.dst_rq);
9828
9829 schedstat_add(sd->lb_imbalance[idle], env.imbalance);
9830
9831 env.src_cpu = busiest->cpu;
9832 env.src_rq = busiest;
9833
9834 ld_moved = 0;
9835
9836 env.flags |= LBF_ALL_PINNED;
9837 if (busiest->nr_running > 1) {
9838
9839
9840
9841
9842
9843
9844 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
9845
9846more_balance:
9847 rq_lock_irqsave(busiest, &rf);
9848 update_rq_clock(busiest);
9849
9850
9851
9852
9853
9854 cur_ld_moved = detach_tasks(&env);
9855
9856
9857
9858
9859
9860
9861
9862
9863
9864 rq_unlock(busiest, &rf);
9865
9866 if (cur_ld_moved) {
9867 attach_tasks(&env);
9868 ld_moved += cur_ld_moved;
9869 }
9870
9871 local_irq_restore(rf.flags);
9872
9873 if (env.flags & LBF_NEED_BREAK) {
9874 env.flags &= ~LBF_NEED_BREAK;
9875 goto more_balance;
9876 }
9877
9878
9879
9880
9881
9882
9883
9884
9885
9886
9887
9888
9889
9890
9891
9892
9893
9894
9895
9896
9897 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
9898
9899
9900 __cpumask_clear_cpu(env.dst_cpu, env.cpus);
9901
9902 env.dst_rq = cpu_rq(env.new_dst_cpu);
9903 env.dst_cpu = env.new_dst_cpu;
9904 env.flags &= ~LBF_DST_PINNED;
9905 env.loop = 0;
9906 env.loop_break = sched_nr_migrate_break;
9907
9908
9909
9910
9911
9912 goto more_balance;
9913 }
9914
9915
9916
9917
9918 if (sd_parent) {
9919 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
9920
9921 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
9922 *group_imbalance = 1;
9923 }
9924
9925
9926 if (unlikely(env.flags & LBF_ALL_PINNED)) {
9927 __cpumask_clear_cpu(cpu_of(busiest), cpus);
9928
9929
9930
9931
9932
9933
9934
9935
9936 if (!cpumask_subset(cpus, env.dst_grpmask)) {
9937 env.loop = 0;
9938 env.loop_break = sched_nr_migrate_break;
9939 goto redo;
9940 }
9941 goto out_all_pinned;
9942 }
9943 }
9944
9945 if (!ld_moved) {
9946 schedstat_inc(sd->lb_failed[idle]);
9947
9948
9949
9950
9951
9952
9953 if (idle != CPU_NEWLY_IDLE)
9954 sd->nr_balance_failed++;
9955
9956 if (need_active_balance(&env)) {
9957 unsigned long flags;
9958
9959 raw_spin_rq_lock_irqsave(busiest, flags);
9960
9961
9962
9963
9964
9965
9966 if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
9967 raw_spin_rq_unlock_irqrestore(busiest, flags);
9968 goto out_one_pinned;
9969 }
9970
9971
9972 env.flags &= ~LBF_ALL_PINNED;
9973
9974
9975
9976
9977
9978
9979 if (!busiest->active_balance) {
9980 busiest->active_balance = 1;
9981 busiest->push_cpu = this_cpu;
9982 active_balance = 1;
9983 }
9984 raw_spin_rq_unlock_irqrestore(busiest, flags);
9985
9986 if (active_balance) {
9987 stop_one_cpu_nowait(cpu_of(busiest),
9988 active_load_balance_cpu_stop, busiest,
9989 &busiest->active_balance_work);
9990 }
9991 }
9992 } else {
9993 sd->nr_balance_failed = 0;
9994 }
9995
9996 if (likely(!active_balance) || need_active_balance(&env)) {
9997
9998 sd->balance_interval = sd->min_interval;
9999 }
10000
10001 goto out;
10002
10003out_balanced:
10004
10005
10006
10007
10008
10009 if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
10010 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
10011
10012 if (*group_imbalance)
10013 *group_imbalance = 0;
10014 }
10015
10016out_all_pinned:
10017
10018
10019
10020
10021
10022 schedstat_inc(sd->lb_balanced[idle]);
10023
10024 sd->nr_balance_failed = 0;
10025
10026out_one_pinned:
10027 ld_moved = 0;
10028
10029
10030
10031
10032
10033
10034
10035 if (env.idle == CPU_NEWLY_IDLE)
10036 goto out;
10037
10038
10039 if ((env.flags & LBF_ALL_PINNED &&
10040 sd->balance_interval < MAX_PINNED_INTERVAL) ||
10041 sd->balance_interval < sd->max_interval)
10042 sd->balance_interval *= 2;
10043out:
10044 return ld_moved;
10045}
10046
10047static inline unsigned long
10048get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
10049{
10050 unsigned long interval = sd->balance_interval;
10051
10052 if (cpu_busy)
10053 interval *= sd->busy_factor;
10054
10055
10056 interval = msecs_to_jiffies(interval);
10057
10058
10059
10060
10061
10062
10063 if (cpu_busy)
10064 interval -= 1;
10065
10066 interval = clamp(interval, 1UL, max_load_balance_interval);
10067
10068 return interval;
10069}
10070
10071static inline void
10072update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
10073{
10074 unsigned long interval, next;
10075
10076
10077 interval = get_sd_balance_interval(sd, 0);
10078 next = sd->last_balance + interval;
10079
10080 if (time_after(*next_balance, next))
10081 *next_balance = next;
10082}
10083
10084
10085
10086
10087
10088
10089
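/*
 * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
 * running tasks off the busiest CPU onto idle CPUs. It requires at
 * least 1 task to be running on each physical CPU where possible, and
 * avoids physical / logical imbalances.
 */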
10090static int active_load_balance_cpu_stop(void *data)
10091{
10092 struct rq *busiest_rq = data;
10093 int busiest_cpu = cpu_of(busiest_rq);
10094 int target_cpu = busiest_rq->push_cpu;
10095 struct rq *target_rq = cpu_rq(target_cpu);
10096 struct sched_domain *sd;
10097 struct task_struct *p = NULL;
10098 struct rq_flags rf;
10099
10100 rq_lock_irq(busiest_rq, &rf);
10101
10102
10103
10104
10105
10106 if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
10107 goto out_unlock;
10108
10109
10110 if (unlikely(busiest_cpu != smp_processor_id() ||
10111 !busiest_rq->active_balance))
10112 goto out_unlock;
10113
10114
10115 if (busiest_rq->nr_running <= 1)
10116 goto out_unlock;
10117
10118
10119
10120
10121
10122
10123 BUG_ON(busiest_rq == target_rq);
10124
10125
10126 rcu_read_lock();
10127 for_each_domain(target_cpu, sd) {
10128 if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
10129 break;
10130 }
10131
10132 if (likely(sd)) {
10133 struct lb_env env = {
10134 .sd = sd,
10135 .dst_cpu = target_cpu,
10136 .dst_rq = target_rq,
10137 .src_cpu = busiest_rq->cpu,
10138 .src_rq = busiest_rq,
10139 .idle = CPU_IDLE,
10140 .flags = LBF_ACTIVE_LB,
10141 };
10142
10143 schedstat_inc(sd->alb_count);
10144 update_rq_clock(busiest_rq);
10145
10146 p = detach_one_task(&env);
10147 if (p) {
10148 schedstat_inc(sd->alb_pushed);
10149
10150 sd->nr_balance_failed = 0;
10151 } else {
10152 schedstat_inc(sd->alb_failed);
10153 }
10154 }
10155 rcu_read_unlock();
10156out_unlock:
10157 busiest_rq->active_balance = 0;
10158 rq_unlock(busiest_rq, &rf);
10159
10160 if (p)
10161 attach_one_task(target_rq, p);
10162
10163 local_irq_enable();
10164
10165 return 0;
10166}
10167
10168static DEFINE_SPINLOCK(balancing);
10169
10170
10171
10172
10173
10174void update_max_interval(void)
10175{
10176 max_load_balance_interval = HZ*num_online_cpus()/10;
10177}
10178
10179
10180
10181
10182
10183
10184
10185static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
10186{
10187 int continue_balancing = 1;
10188 int cpu = rq->cpu;
10189 int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
10190 unsigned long interval;
10191 struct sched_domain *sd;
10192
10193 unsigned long next_balance = jiffies + 60*HZ;
10194 int update_next_balance = 0;
10195 int need_serialize, need_decay = 0;
10196 u64 max_cost = 0;
10197
10198 rcu_read_lock();
10199 for_each_domain(cpu, sd) {
10200
10201
10202
10203
10204 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
10205 sd->max_newidle_lb_cost =
10206 (sd->max_newidle_lb_cost * 253) / 256;
10207 sd->next_decay_max_lb_cost = jiffies + HZ;
10208 need_decay = 1;
10209 }
10210 max_cost += sd->max_newidle_lb_cost;
10211
10212
10213
10214
10215
10216
10217 if (!continue_balancing) {
10218 if (need_decay)
10219 continue;
10220 break;
10221 }
10222
10223 interval = get_sd_balance_interval(sd, busy);
10224
10225 need_serialize = sd->flags & SD_SERIALIZE;
10226 if (need_serialize) {
10227 if (!spin_trylock(&balancing))
10228 goto out;
10229 }
10230
10231 if (time_after_eq(jiffies, sd->last_balance + interval)) {
10232 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
10233
10234
10235
10236
10237
10238 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
10239 busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
10240 }
10241 sd->last_balance = jiffies;
10242 interval = get_sd_balance_interval(sd, busy);
10243 }
10244 if (need_serialize)
10245 spin_unlock(&balancing);
10246out:
10247 if (time_after(next_balance, sd->last_balance + interval)) {
10248 next_balance = sd->last_balance + interval;
10249 update_next_balance = 1;
10250 }
10251 }
10252 if (need_decay) {
10253
10254
10255
10256
10257 rq->max_idle_balance_cost =
10258 max((u64)sysctl_sched_migration_cost, max_cost);
10259 }
10260 rcu_read_unlock();
10261
10262
10263
10264
10265
10266
10267 if (likely(update_next_balance))
10268 rq->next_balance = next_balance;
10269
10270}
10271
10272static inline int on_null_domain(struct rq *rq)
10273{
10274 return unlikely(!rcu_dereference_sched(rq->sd));
10275}
10276
10277#ifdef CONFIG_NO_HZ_COMMON
10278
10279
10280
10281
10282
10283
10284
10285
10286
10287static inline int find_new_ilb(void)
10288{
10289 int ilb;
10290 const struct cpumask *hk_mask;
10291
10292 hk_mask = housekeeping_cpumask(HK_FLAG_MISC);
10293
10294 for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) {
10295
10296 if (ilb == smp_processor_id())
10297 continue;
10298
10299 if (idle_cpu(ilb))
10300 return ilb;
10301 }
10302
10303 return nr_cpu_ids;
10304}
10305
10306
10307
10308
10309
10310static void kick_ilb(unsigned int flags)
10311{
10312 int ilb_cpu;
10313
10314
10315
10316
10317
10318 if (flags & NOHZ_BALANCE_KICK)
10319 nohz.next_balance = jiffies+1;
10320
10321 ilb_cpu = find_new_ilb();
10322
10323 if (ilb_cpu >= nr_cpu_ids)
10324 return;
10325
10326
10327
10328
10329
10330 flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
10331 if (flags & NOHZ_KICK_MASK)
10332 return;
10333
10334
10335
10336
10337
10338
10339 smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
10340}
10341
10342
10343
10344
10345
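/*
 * Current decision point for kicking the idle load balancer in the presence
 * of idle CPUs in the system.
 */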
10346static void nohz_balancer_kick(struct rq *rq)
10347{
10348 unsigned long now = jiffies;
10349 struct sched_domain_shared *sds;
10350 struct sched_domain *sd;
10351 int nr_busy, i, cpu = rq->cpu;
10352 unsigned int flags = 0;
10353
10354 if (unlikely(rq->idle_balance))
10355 return;
10356
10357
10358
10359
10360
10361 nohz_balance_exit_idle(rq);
10362
10363
10364
10365
10366
10367 if (likely(!atomic_read(&nohz.nr_cpus)))
10368 return;
10369
10370 if (READ_ONCE(nohz.has_blocked) &&
10371 time_after(now, READ_ONCE(nohz.next_blocked)))
10372 flags = NOHZ_STATS_KICK;
10373
10374 if (time_before(now, nohz.next_balance))
10375 goto out;
10376
10377 if (rq->nr_running >= 2) {
10378 flags = NOHZ_KICK_MASK;
10379 goto out;
10380 }
10381
10382 rcu_read_lock();
10383
10384 sd = rcu_dereference(rq->sd);
10385 if (sd) {
10386
10387
10388
10389
10390
10391 if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
10392 flags = NOHZ_KICK_MASK;
10393 goto unlock;
10394 }
10395 }
10396
10397 sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
10398 if (sd) {
10399
10400
10401
10402
10403
10404 for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
10405 if (sched_asym_prefer(i, cpu)) {
10406 flags = NOHZ_KICK_MASK;
10407 goto unlock;
10408 }
10409 }
10410 }
10411
10412 sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
10413 if (sd) {
10414
10415
10416
10417
10418 if (check_misfit_status(rq, sd)) {
10419 flags = NOHZ_KICK_MASK;
10420 goto unlock;
10421 }
10422
10423
10424
10425
10426
10427
10428
10429
10430 goto unlock;
10431 }
10432
10433 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
10434 if (sds) {
10435
10436
10437
10438
10439
10440
10441
10442
10443
10444 nr_busy = atomic_read(&sds->nr_busy_cpus);
10445 if (nr_busy > 1) {
10446 flags = NOHZ_KICK_MASK;
10447 goto unlock;
10448 }
10449 }
10450unlock:
10451 rcu_read_unlock();
10452out:
10453 if (flags)
10454 kick_ilb(flags);
10455}
10456
10457static void set_cpu_sd_state_busy(int cpu)
10458{
10459 struct sched_domain *sd;
10460
10461 rcu_read_lock();
10462 sd = rcu_dereference(per_cpu(sd_llc, cpu));
10463
10464 if (!sd || !sd->nohz_idle)
10465 goto unlock;
10466 sd->nohz_idle = 0;
10467
10468 atomic_inc(&sd->shared->nr_busy_cpus);
10469unlock:
10470 rcu_read_unlock();
10471}
10472
10473void nohz_balance_exit_idle(struct rq *rq)
10474{
10475 SCHED_WARN_ON(rq != this_rq());
10476
10477 if (likely(!rq->nohz_tick_stopped))
10478 return;
10479
10480 rq->nohz_tick_stopped = 0;
10481 cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
10482 atomic_dec(&nohz.nr_cpus);
10483
10484 set_cpu_sd_state_busy(rq->cpu);
10485}
10486
10487static void set_cpu_sd_state_idle(int cpu)
10488{
10489 struct sched_domain *sd;
10490
10491 rcu_read_lock();
10492 sd = rcu_dereference(per_cpu(sd_llc, cpu));
10493
10494 if (!sd || sd->nohz_idle)
10495 goto unlock;
10496 sd->nohz_idle = 1;
10497
10498 atomic_dec(&sd->shared->nr_busy_cpus);
10499unlock:
10500 rcu_read_unlock();
10501}
10502
10503
10504
10505
10506
10507void nohz_balance_enter_idle(int cpu)
10508{
10509 struct rq *rq = cpu_rq(cpu);
10510
10511 SCHED_WARN_ON(cpu != smp_processor_id());
10512
10513
10514 if (!cpu_active(cpu))
10515 return;
10516
10517
10518 if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
10519 return;
10520
10521
10522
10523
10524
10525
10526 rq->has_blocked_load = 1;
10527
10528
10529
10530
10531
10532
10533
10534 if (rq->nohz_tick_stopped)
10535 goto out;
10536
10537
10538 if (on_null_domain(rq))
10539 return;
10540
10541 rq->nohz_tick_stopped = 1;
10542
10543 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
10544 atomic_inc(&nohz.nr_cpus);
10545
10546
10547
10548
10549
10550
10551 smp_mb__after_atomic();
10552
10553 set_cpu_sd_state_idle(cpu);
10554
10555out:
10556
10557
10558
10559
10560 WRITE_ONCE(nohz.has_blocked, 1);
10561}
10562
10563static bool update_nohz_stats(struct rq *rq)
10564{
10565 unsigned int cpu = rq->cpu;
10566
10567 if (!rq->has_blocked_load)
10568 return false;
10569
10570 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
10571 return false;
10572
10573 if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
10574 return true;
10575
10576 update_blocked_averages(cpu);
10577
10578 return rq->has_blocked_load;
10579}
10580
10581
10582
10583
10584
10585
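/*
 * Internal function that runs load balance for all idle CPUs. The load
 * balance can be a simple update of blocked load or a complete load balance
 * with task movement, depending on the flags.
 */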
10586static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
10587 enum cpu_idle_type idle)
10588{
10589
10590 unsigned long now = jiffies;
10591 unsigned long next_balance = now + 60*HZ;
10592 bool has_blocked_load = false;
10593 int update_next_balance = 0;
10594 int this_cpu = this_rq->cpu;
10595 int balance_cpu;
10596 struct rq *rq;
10597
10598 SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
10599
10600
10601
10602
10603
10604
10605
10606
10607
10608 WRITE_ONCE(nohz.has_blocked, 0);
10609
10610
10611
10612
10613
10614 smp_mb();
10615
10616
10617
10618
10619
10620 for_each_cpu_wrap(balance_cpu, nohz.idle_cpus_mask, this_cpu+1) {
10621 if (!idle_cpu(balance_cpu))
10622 continue;
10623
10624
10625
10626
10627
10628
10629 if (need_resched()) {
10630 has_blocked_load = true;
10631 goto abort;
10632 }
10633
10634 rq = cpu_rq(balance_cpu);
10635
10636 has_blocked_load |= update_nohz_stats(rq);
10637
10638
10639
10640
10641
10642 if (time_after_eq(jiffies, rq->next_balance)) {
10643 struct rq_flags rf;
10644
10645 rq_lock_irqsave(rq, &rf);
10646 update_rq_clock(rq);
10647 rq_unlock_irqrestore(rq, &rf);
10648
10649 if (flags & NOHZ_BALANCE_KICK)
10650 rebalance_domains(rq, CPU_IDLE);
10651 }
10652
10653 if (time_after(next_balance, rq->next_balance)) {
10654 next_balance = rq->next_balance;
10655 update_next_balance = 1;
10656 }
10657 }
10658
10659
10660
10661
10662
10663
10664 if (likely(update_next_balance))
10665 nohz.next_balance = next_balance;
10666
10667 WRITE_ONCE(nohz.next_blocked,
10668 now + msecs_to_jiffies(LOAD_AVG_PERIOD));
10669
10670abort:
10671
10672 if (has_blocked_load)
10673 WRITE_ONCE(nohz.has_blocked, 1);
10674}
10675
10676
10677
10678
10679
10680static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
10681{
10682 unsigned int flags = this_rq->nohz_idle_balance;
10683
10684 if (!flags)
10685 return false;
10686
10687 this_rq->nohz_idle_balance = 0;
10688
10689 if (idle != CPU_IDLE)
10690 return false;
10691
10692 _nohz_idle_balance(this_rq, flags, idle);
10693
10694 return true;
10695}
10696
10697
10698
10699
10700
10701void nohz_run_idle_balance(int cpu)
10702{
10703 unsigned int flags;
10704
10705 flags = atomic_fetch_andnot(NOHZ_NEWILB_KICK, nohz_flags(cpu));
10706
10707
10708
10709
10710
10711 if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
10712 _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK, CPU_IDLE);
10713}
10714
10715static void nohz_newidle_balance(struct rq *this_rq)
10716{
10717 int this_cpu = this_rq->cpu;
10718
10719
10720
10721
10722
10723 if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
10724 return;
10725
10726
10727 if (this_rq->avg_idle < sysctl_sched_migration_cost)
10728 return;
10729
10730
10731 if (!READ_ONCE(nohz.has_blocked) ||
10732 time_before(jiffies, READ_ONCE(nohz.next_blocked)))
10733 return;
10734
10735
10736
10737
10738
10739 atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu));
10740}
10741
10742#else
10743static inline void nohz_balancer_kick(struct rq *rq) { }
10744
10745static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
10746{
10747 return false;
10748}
10749
10750static inline void nohz_newidle_balance(struct rq *this_rq) { }
10751#endif
10752
10753
10754
10755
10756
10757
10758
10759
10760
10761
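/*
 * newidle_balance is called by schedule() if this_cpu is about to become
 * idle. Attempts to pull tasks from other CPUs.
 *
 * Returns:
 *   < 0 - we released the lock and there are !fair tasks present
 *     0 - failed, no new tasks
 *   > 0 - success, new (fair) tasks present
 */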
10762static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
10763{
10764 unsigned long next_balance = jiffies + HZ;
10765 int this_cpu = this_rq->cpu;
10766 struct sched_domain *sd;
10767 int pulled_task = 0;
10768 u64 curr_cost = 0;
10769
10770 update_misfit_status(NULL, this_rq);
10771
10772
10773
10774
10775
10776 if (this_rq->ttwu_pending)
10777 return 0;
10778
10779
10780
10781
10782
10783 this_rq->idle_stamp = rq_clock(this_rq);
10784
10785
10786
10787
10788 if (!cpu_active(this_cpu))
10789 return 0;
10790
10791
10792
10793
10794
10795
10796
10797 rq_unpin_lock(this_rq, rf);
10798
10799 if (this_rq->avg_idle < sysctl_sched_migration_cost ||
10800 !READ_ONCE(this_rq->rd->overload)) {
10801
10802 rcu_read_lock();
10803 sd = rcu_dereference_check_sched_domain(this_rq->sd);
10804 if (sd)
10805 update_next_balance(sd, &next_balance);
10806 rcu_read_unlock();
10807
10808 goto out;
10809 }
10810
10811 raw_spin_rq_unlock(this_rq);
10812
10813 update_blocked_averages(this_cpu);
10814 rcu_read_lock();
10815 for_each_domain(this_cpu, sd) {
10816 int continue_balancing = 1;
10817 u64 t0, domain_cost;
10818
10819 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
10820 update_next_balance(sd, &next_balance);
10821 break;
10822 }
10823
10824 if (sd->flags & SD_BALANCE_NEWIDLE) {
10825 t0 = sched_clock_cpu(this_cpu);
10826
10827 pulled_task = load_balance(this_cpu, this_rq,
10828 sd, CPU_NEWLY_IDLE,
10829 &continue_balancing);
10830
10831 domain_cost = sched_clock_cpu(this_cpu) - t0;
10832 if (domain_cost > sd->max_newidle_lb_cost)
10833 sd->max_newidle_lb_cost = domain_cost;
10834
10835 curr_cost += domain_cost;
10836 }
10837
10838 update_next_balance(sd, &next_balance);
10839
10840
10841
10842
10843
10844 if (pulled_task || this_rq->nr_running > 0 ||
10845 this_rq->ttwu_pending)
10846 break;
10847 }
10848 rcu_read_unlock();
10849
10850 raw_spin_rq_lock(this_rq);
10851
10852 if (curr_cost > this_rq->max_idle_balance_cost)
10853 this_rq->max_idle_balance_cost = curr_cost;
10854
10855
10856
10857
10858
10859
10860 if (this_rq->cfs.h_nr_running && !pulled_task)
10861 pulled_task = 1;
10862
10863
10864 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
10865 pulled_task = -1;
10866
10867out:
10868
10869 if (time_after(this_rq->next_balance, next_balance))
10870 this_rq->next_balance = next_balance;
10871
10872 if (pulled_task)
10873 this_rq->idle_stamp = 0;
10874 else
10875 nohz_newidle_balance(this_rq);
10876
10877 rq_repin_lock(this_rq, rf);
10878
10879 return pulled_task;
10880}
10881
10882
10883
10884
10885
10886static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
10887{
10888 struct rq *this_rq = this_rq();
10889 enum cpu_idle_type idle = this_rq->idle_balance ?
10890 CPU_IDLE : CPU_NOT_IDLE;
10891
10892
10893
10894
10895
10896
10897
10898
10899
10900 if (nohz_idle_balance(this_rq, idle))
10901 return;
10902
10903
10904 update_blocked_averages(this_rq->cpu);
10905 rebalance_domains(this_rq, idle);
10906}
10907
10908
10909
10910
10911void trigger_load_balance(struct rq *rq)
10912{
10913
10914
10915
10916
10917 if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq))))
10918 return;
10919
10920 if (time_after_eq(jiffies, rq->next_balance))
10921 raise_softirq(SCHED_SOFTIRQ);
10922
10923 nohz_balancer_kick(rq);
10924}
10925
10926static void rq_online_fair(struct rq *rq)
10927{
10928 update_sysctl();
10929
10930 update_runtime_enabled(rq);
10931}
10932
10933static void rq_offline_fair(struct rq *rq)
10934{
10935 update_sysctl();
10936
10937
10938 unthrottle_offline_cfs_rqs(rq);
10939}
10940
10941#endif
10942
10943#ifdef CONFIG_SCHED_CORE
10944static inline bool
10945__entity_slice_used(struct sched_entity *se, int min_nr_tasks)
10946{
10947 u64 slice = sched_slice(cfs_rq_of(se), se);
10948 u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
10949
10950 return (rtime * min_nr_tasks > slice);
10951}
10952
10953#define MIN_NR_TASKS_DURING_FORCEIDLE 2
10954static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
10955{
10956 if (!sched_core_enabled(rq))
10957 return;
10958
10959
10960
10961
10962
10963
10964
10965
10966
10967
10968
10969
10970
10971
10972
10973 if (rq->core->core_forceidle && rq->cfs.nr_running == 1 &&
10974 __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
10975 resched_curr(rq);
10976}
10977
10978
10979
10980
10981static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle)
10982{
10983 for_each_sched_entity(se) {
10984 struct cfs_rq *cfs_rq = cfs_rq_of(se);
10985
10986 if (forceidle) {
10987 if (cfs_rq->forceidle_seq == fi_seq)
10988 break;
10989 cfs_rq->forceidle_seq = fi_seq;
10990 }
10991
10992 cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
10993 }
10994}
10995
10996void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi)
10997{
10998 struct sched_entity *se = &p->se;
10999
11000 if (p->sched_class != &fair_sched_class)
11001 return;
11002
11003 se_fi_update(se, rq->core->core_forceidle_seq, in_fi);
11004}
11005
11006bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
11007{
11008 struct rq *rq = task_rq(a);
11009 struct sched_entity *sea = &a->se;
11010 struct sched_entity *seb = &b->se;
11011 struct cfs_rq *cfs_rqa;
11012 struct cfs_rq *cfs_rqb;
11013 s64 delta;
11014
11015 SCHED_WARN_ON(task_rq(b)->core != rq->core);
11016
11017#ifdef CONFIG_FAIR_GROUP_SCHED
11018
11019
11020
11021
11022 while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
11023 int sea_depth = sea->depth;
11024 int seb_depth = seb->depth;
11025
11026 if (sea_depth >= seb_depth)
11027 sea = parent_entity(sea);
11028 if (sea_depth <= seb_depth)
11029 seb = parent_entity(seb);
11030 }
11031
11032 se_fi_update(sea, rq->core->core_forceidle_seq, in_fi);
11033 se_fi_update(seb, rq->core->core_forceidle_seq, in_fi);
11034
11035 cfs_rqa = sea->cfs_rq;
11036 cfs_rqb = seb->cfs_rq;
11037#else
11038 cfs_rqa = &task_rq(a)->cfs;
11039 cfs_rqb = &task_rq(b)->cfs;
11040#endif
11041
11042
11043
11044
11045
11046
11047 delta = (s64)(sea->vruntime - seb->vruntime) +
11048 (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
11049
11050 return delta > 0;
11051}
11052#else
11053static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
11054#endif
11055
11056
11057
11058
11059
11060
11061
11062
11063
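/*
 * Scheduler tick hitting a task of our scheduling class: run entity_tick()
 * for every level of the task's cfs_rq hierarchy, then update NUMA balancing,
 * misfit and overutilized status, and the core-scheduling forced-idle check.
 */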
11064static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
11065{
11066 struct cfs_rq *cfs_rq;
11067 struct sched_entity *se = &curr->se;
11068
11069 for_each_sched_entity(se) {
11070 cfs_rq = cfs_rq_of(se);
11071 entity_tick(cfs_rq, se, queued);
11072 }
11073
11074 if (static_branch_unlikely(&sched_numa_balancing))
11075 task_tick_numa(rq, curr);
11076
11077 update_misfit_status(curr, rq);
11078 update_overutilized_status(task_rq(curr));
11079
11080 task_tick_core(rq, curr);
11081}
11082
11083
11084
11085
11086
11087
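/*
 * called on fork with the child task as argument from the parent's context
 *  - child not yet on the tasklist
 *  - preemption disabled
 */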
11088static void task_fork_fair(struct task_struct *p)
11089{
11090 struct cfs_rq *cfs_rq;
11091 struct sched_entity *se = &p->se, *curr;
11092 struct rq *rq = this_rq();
11093 struct rq_flags rf;
11094
11095 rq_lock(rq, &rf);
11096 update_rq_clock(rq);
11097
11098 cfs_rq = task_cfs_rq(current);
11099 curr = cfs_rq->curr;
11100 if (curr) {
11101 update_curr(cfs_rq);
11102 se->vruntime = curr->vruntime;
11103 }
11104 place_entity(cfs_rq, se, 1);
11105
11106 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
11107
11108
11109
11110
11111 swap(curr->vruntime, se->vruntime);
11112 resched_curr(rq);
11113 }
11114
11115 se->vruntime -= cfs_rq->min_vruntime;
11116 rq_unlock(rq, &rf);
11117}
11118
11119
11120
11121
11122
11123static void
11124prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
11125{
11126 if (!task_on_rq_queued(p))
11127 return;
11128
11129 if (rq->cfs.nr_running == 1)
11130 return;
11131
11132
11133
11134
11135
11136
11137 if (task_current(rq, p)) {
11138 if (p->prio > oldprio)
11139 resched_curr(rq);
11140 } else
11141 check_preempt_curr(rq, p, 0);
11142}
11143
11144static inline bool vruntime_normalized(struct task_struct *p)
11145{
11146 struct sched_entity *se = &p->se;
11147
11148
11149
11150
11151
11152
11153 if (p->on_rq)
11154 return true;
11155
11156
11157
11158
11159
11160
11161
11162
11163
11164
11165 if (!se->sum_exec_runtime ||
11166 (READ_ONCE(p->__state) == TASK_WAKING && p->sched_remote_wakeup))
11167 return true;
11168
11169 return false;
11170}
11171
11172#ifdef CONFIG_FAIR_GROUP_SCHED
11173
11174
11175
11176
11177static void propagate_entity_cfs_rq(struct sched_entity *se)
11178{
11179 struct cfs_rq *cfs_rq;
11180
11181 list_add_leaf_cfs_rq(cfs_rq_of(se));
11182
11183
11184 se = se->parent;
11185
11186 for_each_sched_entity(se) {
11187 cfs_rq = cfs_rq_of(se);
11188
11189 if (!cfs_rq_throttled(cfs_rq)) {
11190 update_load_avg(cfs_rq, se, UPDATE_TG);
11191 list_add_leaf_cfs_rq(cfs_rq);
11192 continue;
11193 }
11194
11195 if (list_add_leaf_cfs_rq(cfs_rq))
11196 break;
11197 }
11198}
11199#else
11200static void propagate_entity_cfs_rq(struct sched_entity *se) { }
11201#endif
11202
11203static void detach_entity_cfs_rq(struct sched_entity *se)
11204{
11205 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11206
11207
11208 update_load_avg(cfs_rq, se, 0);
11209 detach_entity_load_avg(cfs_rq, se);
11210 update_tg_load_avg(cfs_rq);
11211 propagate_entity_cfs_rq(se);
11212}
11213
11214static void attach_entity_cfs_rq(struct sched_entity *se)
11215{
11216 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11217
11218#ifdef CONFIG_FAIR_GROUP_SCHED
11219
11220
11221
11222
11223 se->depth = se->parent ? se->parent->depth + 1 : 0;
11224#endif
11225
11226
11227 update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
11228 attach_entity_load_avg(cfs_rq, se);
11229 update_tg_load_avg(cfs_rq);
11230 propagate_entity_cfs_rq(se);
11231}
11232
11233static void detach_task_cfs_rq(struct task_struct *p)
11234{
11235 struct sched_entity *se = &p->se;
11236 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11237
11238 if (!vruntime_normalized(p)) {
11239
11240
11241
11242
11243 place_entity(cfs_rq, se, 0);
11244 se->vruntime -= cfs_rq->min_vruntime;
11245 }
11246
11247 detach_entity_cfs_rq(se);
11248}
11249
11250static void attach_task_cfs_rq(struct task_struct *p)
11251{
11252 struct sched_entity *se = &p->se;
11253 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11254
11255 attach_entity_cfs_rq(se);
11256
11257 if (!vruntime_normalized(p))
11258 se->vruntime += cfs_rq->min_vruntime;
11259}
11260
11261static void switched_from_fair(struct rq *rq, struct task_struct *p)
11262{
11263 detach_task_cfs_rq(p);
11264}
11265
11266static void switched_to_fair(struct rq *rq, struct task_struct *p)
11267{
11268 attach_task_cfs_rq(p);
11269
11270 if (task_on_rq_queued(p)) {
11271
11272
11273
11274
11275
11276 if (task_current(rq, p))
11277 resched_curr(rq);
11278 else
11279 check_preempt_curr(rq, p, 0);
11280 }
11281}
11282
11283
11284
11285
11286
11287
11288static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
11289{
11290 struct sched_entity *se = &p->se;
11291
11292#ifdef CONFIG_SMP
11293 if (task_on_rq_queued(p)) {
11294
11295
11296
11297
11298 list_move(&se->group_node, &rq->cfs_tasks);
11299 }
11300#endif
11301
11302 for_each_sched_entity(se) {
11303 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11304
11305 set_next_entity(cfs_rq, se);
11306
11307 account_cfs_rq_runtime(cfs_rq, 0);
11308 }
11309}
11310
11311void init_cfs_rq(struct cfs_rq *cfs_rq)
11312{
11313 cfs_rq->tasks_timeline = RB_ROOT_CACHED;
11314 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
11315#ifndef CONFIG_64BIT
11316 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
11317#endif
11318#ifdef CONFIG_SMP
11319 raw_spin_lock_init(&cfs_rq->removed.lock);
11320#endif
11321}
11322
11323#ifdef CONFIG_FAIR_GROUP_SCHED
11324static void task_set_group_fair(struct task_struct *p)
11325{
11326 struct sched_entity *se = &p->se;
11327
11328 set_task_rq(p, task_cpu(p));
11329 se->depth = se->parent ? se->parent->depth + 1 : 0;
11330}
11331
11332static void task_move_group_fair(struct task_struct *p)
11333{
11334 detach_task_cfs_rq(p);
11335 set_task_rq(p, task_cpu(p));
11336
11337#ifdef CONFIG_SMP
11338
11339 p->se.avg.last_update_time = 0;
11340#endif
11341 attach_task_cfs_rq(p);
11342}
11343
11344static void task_change_group_fair(struct task_struct *p, int type)
11345{
11346 switch (type) {
11347 case TASK_SET_GROUP:
11348 task_set_group_fair(p);
11349 break;
11350
11351 case TASK_MOVE_GROUP:
11352 task_move_group_fair(p);
11353 break;
11354 }
11355}
11356
11357void free_fair_sched_group(struct task_group *tg)
11358{
11359 int i;
11360
11361 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
11362
11363 for_each_possible_cpu(i) {
11364 if (tg->cfs_rq)
11365 kfree(tg->cfs_rq[i]);
11366 if (tg->se)
11367 kfree(tg->se[i]);
11368 }
11369
11370 kfree(tg->cfs_rq);
11371 kfree(tg->se);
11372}
11373
11374int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
11375{
11376 struct sched_entity *se;
11377 struct cfs_rq *cfs_rq;
11378 int i;
11379
11380 tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
11381 if (!tg->cfs_rq)
11382 goto err;
11383 tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL);
11384 if (!tg->se)
11385 goto err;
11386
11387 tg->shares = NICE_0_LOAD;
11388
11389 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
11390
11391 for_each_possible_cpu(i) {
11392 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
11393 GFP_KERNEL, cpu_to_node(i));
11394 if (!cfs_rq)
11395 goto err;
11396
11397 se = kzalloc_node(sizeof(struct sched_entity),
11398 GFP_KERNEL, cpu_to_node(i));
11399 if (!se)
11400 goto err_free_rq;
11401
11402 init_cfs_rq(cfs_rq);
11403 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
11404 init_entity_runnable_average(se);
11405 }
11406
11407 return 1;
11408
11409err_free_rq:
11410 kfree(cfs_rq);
11411err:
11412 return 0;
11413}
11414
11415void online_fair_sched_group(struct task_group *tg)
11416{
11417 struct sched_entity *se;
11418 struct rq_flags rf;
11419 struct rq *rq;
11420 int i;
11421
11422 for_each_possible_cpu(i) {
11423 rq = cpu_rq(i);
11424 se = tg->se[i];
11425 rq_lock_irq(rq, &rf);
11426 update_rq_clock(rq);
11427 attach_entity_cfs_rq(se);
11428 sync_throttle(tg, i);
11429 rq_unlock_irq(rq, &rf);
11430 }
11431}
11432
11433void unregister_fair_sched_group(struct task_group *tg)
11434{
11435 unsigned long flags;
11436 struct rq *rq;
11437 int cpu;
11438
11439 for_each_possible_cpu(cpu) {
11440 if (tg->se[cpu])
11441 remove_entity_load_avg(tg->se[cpu]);
11442
11443
11444
11445
11446
11447 if (!tg->cfs_rq[cpu]->on_list)
11448 continue;
11449
11450 rq = cpu_rq(cpu);
11451
11452 raw_spin_rq_lock_irqsave(rq, flags);
11453 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
11454 raw_spin_rq_unlock_irqrestore(rq, flags);
11455 }
11456}
11457
11458void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
11459 struct sched_entity *se, int cpu,
11460 struct sched_entity *parent)
11461{
11462 struct rq *rq = cpu_rq(cpu);
11463
11464 cfs_rq->tg = tg;
11465 cfs_rq->rq = rq;
11466 init_cfs_rq_runtime(cfs_rq);
11467
11468 tg->cfs_rq[cpu] = cfs_rq;
11469 tg->se[cpu] = se;
11470
11471
11472 if (!se)
11473 return;
11474
11475 if (!parent) {
11476 se->cfs_rq = &rq->cfs;
11477 se->depth = 0;
11478 } else {
11479 se->cfs_rq = parent->my_q;
11480 se->depth = parent->depth + 1;
11481 }
11482
11483 se->my_q = cfs_rq;
11484
11485 update_load_set(&se->load, NICE_0_LOAD);
11486 se->parent = parent;
11487}
11488
11489static DEFINE_MUTEX(shares_mutex);
11490
11491static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)
11492{
11493 int i;
11494
11495 lockdep_assert_held(&shares_mutex);
11496
11497
11498
11499
11500 if (!tg->se[0])
11501 return -EINVAL;
11502
11503 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
11504
11505 if (tg->shares == shares)
11506 return 0;
11507
11508 tg->shares = shares;
11509 for_each_possible_cpu(i) {
11510 struct rq *rq = cpu_rq(i);
11511 struct sched_entity *se = tg->se[i];
11512 struct rq_flags rf;
11513
11514
11515 rq_lock_irqsave(rq, &rf);
11516 update_rq_clock(rq);
11517 for_each_sched_entity(se) {
11518 update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
11519 update_cfs_group(se);
11520 }
11521 rq_unlock_irqrestore(rq, &rf);
11522 }
11523
11524 return 0;
11525}
11526
11527int sched_group_set_shares(struct task_group *tg, unsigned long shares)
11528{
11529 int ret;
11530
11531 mutex_lock(&shares_mutex);
11532 if (tg_is_idle(tg))
11533 ret = -EINVAL;
11534 else
11535 ret = __sched_group_set_shares(tg, shares);
11536 mutex_unlock(&shares_mutex);
11537
11538 return ret;
11539}
11540
11541int sched_group_set_idle(struct task_group *tg, long idle)
11542{
11543 int i;
11544
11545 if (tg == &root_task_group)
11546 return -EINVAL;
11547
11548 if (idle < 0 || idle > 1)
11549 return -EINVAL;
11550
11551 mutex_lock(&shares_mutex);
11552
11553 if (tg->idle == idle) {
11554 mutex_unlock(&shares_mutex);
11555 return 0;
11556 }
11557
11558 tg->idle = idle;
11559
11560 for_each_possible_cpu(i) {
11561 struct rq *rq = cpu_rq(i);
11562 struct sched_entity *se = tg->se[i];
11563 struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i];
11564 bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
11565 long idle_task_delta;
11566 struct rq_flags rf;
11567
11568 rq_lock_irqsave(rq, &rf);
11569
11570 grp_cfs_rq->idle = idle;
11571 if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
11572 goto next_cpu;
11573
11574 idle_task_delta = grp_cfs_rq->h_nr_running -
11575 grp_cfs_rq->idle_h_nr_running;
11576 if (!cfs_rq_is_idle(grp_cfs_rq))
11577 idle_task_delta *= -1;
11578
11579 for_each_sched_entity(se) {
11580 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11581
11582 if (!se->on_rq)
11583 break;
11584
11585 cfs_rq->idle_h_nr_running += idle_task_delta;
11586
			/* Already accounted at parent level and above. */
11588 if (cfs_rq_is_idle(cfs_rq))
11589 break;
11590 }
11591
11592next_cpu:
11593 rq_unlock_irqrestore(rq, &rf);
11594 }
11595
	/* Idle groups run with the minimum (idle) weight. */
11597 if (tg_is_idle(tg))
11598 __sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO));
11599 else
11600 __sched_group_set_shares(tg, NICE_0_LOAD);
11601
11602 mutex_unlock(&shares_mutex);
11603 return 0;
11604}
11605
#else /* CONFIG_FAIR_GROUP_SCHED */
11607
11608void free_fair_sched_group(struct task_group *tg) { }
11609
11610int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
11611{
11612 return 1;
11613}
11614
11615void online_fair_sched_group(struct task_group *tg) { }
11616
11617void unregister_fair_sched_group(struct task_group *tg) { }
11618
#endif /* CONFIG_FAIR_GROUP_SCHED */
11620
11621
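/*
 * Effective time slice reported for a CFS task; this is what the
 * sched_rr_get_interval() syscall ends up returning for SCHED_OTHER
 * tasks.
 */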
11622static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
11623{
11624 struct sched_entity *se = &task->se;
11625 unsigned int rr_interval = 0;
11626
	/*
	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
	 * idle runqueue:
	 */
11631 if (rq->cfs.load.weight)
11632 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
11633
11634 return rr_interval;
11635}
11636
/*
 * All the scheduling class methods:
 */
11640DEFINE_SCHED_CLASS(fair) = {
11641
11642 .enqueue_task = enqueue_task_fair,
11643 .dequeue_task = dequeue_task_fair,
11644 .yield_task = yield_task_fair,
11645 .yield_to_task = yield_to_task_fair,
11646
11647 .check_preempt_curr = check_preempt_wakeup,
11648
11649 .pick_next_task = __pick_next_task_fair,
11650 .put_prev_task = put_prev_task_fair,
11651 .set_next_task = set_next_task_fair,
11652
11653#ifdef CONFIG_SMP
11654 .balance = balance_fair,
11655 .pick_task = pick_task_fair,
11656 .select_task_rq = select_task_rq_fair,
11657 .migrate_task_rq = migrate_task_rq_fair,
11658
11659 .rq_online = rq_online_fair,
11660 .rq_offline = rq_offline_fair,
11661
11662 .task_dead = task_dead_fair,
11663 .set_cpus_allowed = set_cpus_allowed_common,
11664#endif
11665
11666 .task_tick = task_tick_fair,
11667 .task_fork = task_fork_fair,
11668
11669 .prio_changed = prio_changed_fair,
11670 .switched_from = switched_from_fair,
11671 .switched_to = switched_to_fair,
11672
11673 .get_rr_interval = get_rr_interval_fair,
11674
11675 .update_curr = update_curr_fair,
11676
11677#ifdef CONFIG_FAIR_GROUP_SCHED
11678 .task_change_group = task_change_group_fair,
11679#endif
11680
11681#ifdef CONFIG_UCLAMP_TASK
11682 .uclamp_enabled = 1,
11683#endif
11684};
11685
11686#ifdef CONFIG_SCHED_DEBUG
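/*
 * Dump every leaf cfs_rq of a CPU for the scheduler debug output.
 */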
11687void print_cfs_stats(struct seq_file *m, int cpu)
11688{
11689 struct cfs_rq *cfs_rq, *pos;
11690
11691 rcu_read_lock();
11692 for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
11693 print_cfs_rq(m, cpu, cfs_rq);
11694 rcu_read_unlock();
11695}
11696
11697#ifdef CONFIG_NUMA_BALANCING
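/*
 * Dump per-node NUMA fault statistics for a task and its numa_group.
 */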
11698void show_numa_stats(struct task_struct *p, struct seq_file *m)
11699{
11700 int node;
11701 unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
11702 struct numa_group *ng;
11703
11704 rcu_read_lock();
11705 ng = rcu_dereference(p->numa_group);
11706 for_each_online_node(node) {
11707 if (p->numa_faults) {
11708 tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
11709 tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
11710 }
11711 if (ng) {
			gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
11713 gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
11714 }
11715 print_numa_stats(m, node, tsf, tpf, gsf, gpf);
11716 }
11717 rcu_read_unlock();
11718}
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_SCHED_DEBUG */
11721
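/*
 * Early init: register the SCHED_SOFTIRQ load-balancing handler and,
 * with NO_HZ_COMMON, set up the nohz idle-balance bookkeeping.
 */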
11722__init void init_sched_fair_class(void)
11723{
11724#ifdef CONFIG_SMP
11725 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
11726
11727#ifdef CONFIG_NO_HZ_COMMON
11728 nohz.next_balance = jiffies;
11729 nohz.next_blocked = jiffies;
11730 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
11731#endif
11732#endif
11733
11734}
11735
/*
 * Helper functions to facilitate extracting info from tracepoints.
 */
11739
11740const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
11741{
11742#ifdef CONFIG_SMP
11743 return cfs_rq ? &cfs_rq->avg : NULL;
11744#else
11745 return NULL;
11746#endif
11747}
11748EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
11749
11750char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
11751{
11752 if (!cfs_rq) {
11753 if (str)
11754 strlcpy(str, "(null)", len);
11755 else
11756 return NULL;
11757 }
11758
11759 cfs_rq_tg_path(cfs_rq, str, len);
11760 return str;
11761}
11762EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
11763
11764int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
11765{
11766 return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
11767}
11768EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
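
/*
 * Illustrative sketch (not part of this file): these exported helpers
 * exist so that modules hooking the bare PELT tracepoints can decode
 * the otherwise opaque pointers. Assuming the pelt_cfs_tp tracepoint
 * declared in include/trace/events/sched.h, such a module might do
 * roughly:
 *
 *	static void my_pelt_cfs_probe(void *data, struct cfs_rq *cfs_rq)
 *	{
 *		const struct sched_avg *avg = sched_trace_cfs_rq_avg(cfs_rq);
 *		int cpu = sched_trace_cfs_rq_cpu(cfs_rq);
 *
 *		if (avg)
 *			trace_printk("cpu=%d util_avg=%lu\n", cpu, avg->util_avg);
 *	}
 *
 *	...
 *	register_trace_pelt_cfs_tp(my_pelt_cfs_probe, NULL);
 *
 * my_pelt_cfs_probe is a made-up name; error handling and the matching
 * unregister call are omitted.
 */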
11769
11770const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
11771{
11772#ifdef CONFIG_SMP
11773 return rq ? &rq->avg_rt : NULL;
11774#else
11775 return NULL;
11776#endif
11777}
11778EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
11779
11780const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
11781{
11782#ifdef CONFIG_SMP
11783 return rq ? &rq->avg_dl : NULL;
11784#else
11785 return NULL;
11786#endif
11787}
11788EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
11789
11790const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
11791{
11792#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
11793 return rq ? &rq->avg_irq : NULL;
11794#else
11795 return NULL;
11796#endif
11797}
11798EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
11799
11800int sched_trace_rq_cpu(struct rq *rq)
11801{
11802 return rq ? cpu_of(rq) : -1;
11803}
11804EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
11805
11806int sched_trace_rq_cpu_capacity(struct rq *rq)
11807{
11808 return rq ?
11809#ifdef CONFIG_SMP
11810 rq->cpu_capacity
11811#else
11812 SCHED_CAPACITY_SCALE
11813#endif
11814 : -1;
11815}
11816EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity);
11817
11818const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
11819{
11820#ifdef CONFIG_SMP
11821 return rd ? rd->span : NULL;
11822#else
11823 return NULL;
11824#endif
11825}
11826EXPORT_SYMBOL_GPL(sched_trace_rd_span);
11827
11828int sched_trace_rq_nr_running(struct rq *rq)
11829{
11830 return rq ? rq->nr_running : -1;
11831}
11832EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);
11833